Пример #1
0
    def __init__(self, parseClass):
        """Wire up the crawler pipeline and tune per-parser settings.

        :param parseClass: parser instance; its string representation is
            used below to pick the request delay.
        """
        self.urls = url_manager.UrlManager()
        self.comms = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = parseClass
        self.outputer = html_outputer.HtmlOutputer()
        self.data_stat = data_stat.DataStat()

        # Progress / bookkeeping counters.
        self.count = 1
        self.total = 0
        self.quantity_of_raw_datas = 0
        self.quantity_of_dupli = 0
        self.quantity_of_datas = 0

        # Pause after this many consecutive pages with no data.
        self.nodata = 0
        self.nodata_pages_stop = 5

        # Pause after this many consecutive 404 pages.
        self.forbidden = 0
        self.forbidden_pages_stop = 2

        # Politeness delay: these parsers get a 3-second delay, every
        # other parser (including WBParser) gets none.
        slow_parsers = ('AjkParser', 'GjParser', 'LjParser', 'LejuParser')
        parser_name = str(parseClass)
        if any(name in parser_name for name in slow_parsers):
            self.delay = 3
        else:
            self.delay = 0
Пример #2
0
 def __init__(self):
     """Create the URL manager, downloader, parser, outputer and the
     lists that accumulate scraped results."""
     self.urls = url_manage.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
     # Result accumulators.
     self.titles, self.pictures, self.links = [], [], []
Пример #3
0
    def get_pic_url_list(self, key_words, pages):
        """Collect thumbnail URLs from Baidu image search for each keyword.

        :param key_words: iterable of search keywords.
        :param pages: iterable of page counts, zipped with *key_words*.
        :returns: tuple ``(key_word_list, urls_list)`` where
            ``urls_list[i]`` holds the ``thumbURL`` values found for
            ``key_word_list[i]``.
        """
        url = "http://image.baidu.com/search/acjson"

        downloader = html_downloader.HtmlDownloader()
        urls_list = []
        key_word_list = []
        for key_word, page in zip(key_words, pages):
            urls = []
            # Baidu pages results by offset in steps of 30; one request
            # per page (the original built all param dicts first, then
            # issued the requests — merged into a single loop).
            for offset in range(30, 30 * page + 30, 30):
                params = {
                    'tn': 'resultjson_com',
                    'ipn': 'rj',
                    'ct': 201326592,
                    'is': '',
                    'fp': 'result',
                    'queryWord': key_word,
                    'cl': 2,
                    'lm': -1,
                    'ie': 'utf-8',
                    'oe': 'utf-8',
                    'adpicid': '',
                    'st': -1,
                    'z': '',
                    'ic': 0,
                    'word': key_word,
                    's': '',
                    'se': '',
                    'tab': '',
                    'width': '',
                    'height': '',
                    'face': 0,
                    'istype': 2,
                    'qc': '',
                    'nc': 1,
                    'fr': '',
                    'pn': offset,
                    'rn': 30,
                    'gsm': '1e',
                    '1488942260214': ''
                }

                resp = downloader.get_with_params(url, params)
                # Fix: a response without a 'data' key made the loop below
                # iterate over None and crash; fall back to an empty list.
                json_data_list = json.loads(resp).get('data') or []
                for json_data in json_data_list:
                    thumb = json_data.get('thumbURL')
                    if thumb is not None:
                        urls.append(thumb)

            urls_list.append(urls)
            key_word_list.append(key_word)

        return key_word_list, urls_list
Пример #4
0
    def __init__(self, root_url, proxy_pool, threads):
        """Assemble the scanning pipeline rooted at *root_url*.

        :param root_url: starting URL; its hostname is handed to the parser.
        :param proxy_pool: proxy pool shared by the downloader and scanners.
        :param threads: worker count for the thread pool.
        """
        self.manager = url_manager.UrlManger()
        self.downloader = html_downloader.HtmlDownloader(proxy_pool)
        # Parser is scoped to the root URL's hostname.
        self.parser = html_parser.HtmlParser(
            urlparse.urlparse(root_url).hostname)
        self.outputer = html_outputer.HtmlOutputer()

        # self.proxy_pool = proxy_pool

        # Scanner components (names suggest directory / CMS / SQLi / XSS
        # checks — confirm against the scan modules). DirScan additionally
        # receives the protocol-qualified root from the manager.
        self.dir = dir_scan.DirScan(proxy_pool,
                                    self.manager.set_protocol(root_url))
        self.cms = cms_scan.CMSScan(proxy_pool)
        self.sqli = sqli_scan.SqliScan(proxy_pool)
        self.xss = xss_scan.XSSScan(proxy_pool)

        self.pool = ThreadPool(threads)
Пример #5
0
 def __init__(self, proxy_pool=None, url=None):
     """Remember the target *url* and build a downloader over *proxy_pool*."""
     self.url = url
     self.downloader = html_downloader.HtmlDownloader(proxy_pool)
Пример #6
0
#coding:utf-8
# Ad-hoc test script for the 58.com second-hand-housing listing parser.
# NOTE(review): reload(sys) / sys.setdefaultencoding are Python-2-only.
from spider import html_downloader, AJK_parser, mytools
# from bs4 import BeautifulSoup
from lxml import etree
import re
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

downloader = html_downloader.HtmlDownloader()
parser = AJK_parser.AjkParser()
url = 'http://xm.58.com/ershoufang/pn1/'
html_cont = downloader.download(url, False, True)

# Parse with lxml.
sel = etree.HTML(html_cont.encode('utf-8'))

# Extract pagination links.
pages = sel.xpath('//div[@class="pager"]/a/@href')
# for page in pages:
#     print(page)
# soup = BeautifulSoup(html_cont,'lxml',from_encoding='utf-8')

# Per-listing node lists; the three are zipped together below, so they
# are assumed to line up one-to-one per listing.
titles = sel.xpath('//h2[@class="title"]/a')
prices = sel.xpath('//p[@class="sum"]/b')
houses = sel.xpath('//div[@class="list-info"]')

i = 1

# NOTE(review): loop body continues past this excerpt.
for title, price, house in zip(titles, prices, houses):
    each_data = {
Пример #7
0
 def __init__(self):
     """Build the picture spider: downloader, xpath parser, outputer, OCR."""
     fetcher = html_downloader.HtmlDownloader()
     pic_parser = blog_sina_parser_by_xpath.PicParserByXpatch()
     writer = spider_outputer.PicOutputer()
     ocr = baidu_aip_ocr.PicOcrer()
     self.downloader = fetcher
     self.parser = pic_parser
     self.outputer = writer
     self.ocrer = ocr
Пример #8
0
 def __init__(self):
     """Create the movie spider's downloader, xpath parser and outputer."""
     components = (
         ('downloader', html_downloader.HtmlDownloader()),
         ('parser', movie_parser_by_xpath.MovieParserByXpatch()),
         ('outputer', movie_outputer.MovieOutputer()),
     )
     for attr, component in components:
         setattr(self, attr, component)
Пример #9
0
 def __init__(self):
     """Wire up the spider: URL manager, downloader, xpath parser, outputer."""
     manager = url_manager.UrlManager()
     fetcher = html_downloader.HtmlDownloader()
     page_parser = html_parser_by_xpath.HtmlParserByXpath()
     writer = spider_outputer.HtmlOutputer()
     self.urls, self.downloader = manager, fetcher
     self.parser, self.outputer = page_parser, writer
Пример #10
0
 def __init__(self):
     """Assemble the Baidu picture spider (downloader, parser, outputer)."""
     parts = {
         'downloader': html_downloader.HtmlDownloader(),
         'parser': baidu_pic_parser.PicParserByXpatch(),
         'outputer': spider_outputer.PicOutputer(),
     }
     for name, part in parts.items():
         setattr(self, name, part)
Пример #11
0
 def __init__(self):
     """Initialise the crawler's four collaborators."""
     self.urls = url_manager.UrlManger()  # URL manager
     self.downloader = html_downloader.HtmlDownloader()  # page downloader
     self.parser = html_parser.HtmlParser()  # HTML parser
     self.outputer = html_output.HtmlOutputer()  # HTML outputer
Пример #12
0
 def __init__(self):
     """Create one instance of each pipeline stage: URL manager,
     downloader, parser and data outputer."""
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
Пример #13
0
 def __init__(self):
     """Build the image-spider collaborators."""
     url_store = url_manager.UrlManager()
     page_parser = html_parser.HtmlParser()
     fetcher = html_downloader.HtmlDownloader()
     image_writer = img_outputer.ImgOutputer()
     self.url = url_store
     self.parser = page_parser
     self.downloader = fetcher
     self.outputer = image_writer
Пример #14
0
 def __init__(self):
     """Spider components plus a MySQL connection wrapper."""
     manager = url_manager.UrlManager()
     fetcher = html_downloader.HtmlDownloader()
     page_parser = html_parser.HtmlParser()
     writer = html_outputer.HtmlOutputer()
     self.urls = manager
     self.downloader = fetcher
     self.parser = page_parser
     self.outputer = writer
     # Project class name is spelled 'Conenct' in connect_mysql.
     self.connect = connect_mysql.Conenct()
Пример #15
0
 def __init__(self):
     """Instantiate URL manager, downloader, parser and outputer."""
     self.urls, self.downloader, self.parser, self.outputer = (
         url_manager.UrlManager(),
         html_downloader.HtmlDownloader(),
         html_parser.HtmlParser(),
         html_outputer.HtmlOutputer(),
     )
Пример #16
0
 def __init__(self, proxy_pool):
     """Build a proxied downloader plus a URL manager.

     :param proxy_pool: proxy pool handed to the downloader.
     """
     fetcher = html_downloader.HtmlDownloader(proxy_pool)
     self.downloader = fetcher
     self.manager = url_manager.UrlManger()
Пример #17
0
 def __init__(self):
     """Create the URL manager, downloader, parser and outputer.

     (Methods are attributes too: a function name is just a variable
     bound to a function object.)
     """
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()