class Spider():
    """Spider for Sogou's WeChat article search.

    Holds the shared request configuration (search keyword, browser-like
    headers, HTTP session and request queue) and seeds the queue with the
    first search-result request.
    """

    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # Request headers copied from a logged-in browser session (developer
    # tools). Per the original author, the Cookie field has to be sent in
    # order to crawl all 100 result pages.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        # Fixed language tag: the original read "zh=TW", which is not a valid
        # language range.
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'IPLOC=CN1100; SUID=6FEDCF3C541C970A000000005968CF55; SUV=1500041046435211;'
                  'ABTEST=0|1500041048|v1; SNUID=CEA85AE02A2F7E6EAFF9C1FE2ABEBE6F; weixinIndexVisited=1;'
                  'JSESSIONID=aaar_m7LEIW-jg_gikPZv; Id=WKllllllll2BzGMVlllllV0o8cUlllll5G@HbZllll9lllllRklll5'
                  '@@@@@@@@@@',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        # Fixed spacing: the original concatenation produced
        # "...(KHTML, likeGecko)Chrome/59..." with the separating spaces lost.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/59.0.3071.115 Safari/537.36',
    }
    session = Session()   # session object used to execute requests
    queue = RedisQueue()  # queue object that stores pending requests

    def start(self):
        """Apply the class headers to the session and enqueue the first request.

        The first request targets the search page for ``self.keyword``; it is
        created with ``self.parse_index`` as its callback and with
        ``need_proxy=True``.

        :return: None
        """
        # Update the session-wide headers so the Cookie is available to requests.
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({'query': self.keyword, 'type': 2})
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index, need_proxy=True)
        # Schedule the first request: add it to the queue to await dispatch.
        self.queue.add(weixin_request)
Exemplo n.º 2
0
class Spider():
    """Spider for Sogou's WeChat article search.

    Holds the shared request configuration (search keyword, browser-like
    headers, HTTP session and request queue) and seeds the queue with the
    first search-result request.
    """

    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # Long values are split with implicit string concatenation. The original
    # used a backslash continuation *inside* the literals, which embedded the
    # next line's indentation whitespace into the Accept, Cookie and
    # User-Agent values. The duplicated 'Cache-Control' and 'Connection'
    # entries (identical values) were collapsed.
    headers = {
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # NOTE(review): Content-Encoding describes a request body; it looks
        # unnecessary here but is kept as in the original.
        'Content-Encoding': 'gzip',
        'Accept': (
            'text/html,application/xhtml+xml,application/xml;'
            'q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3'
        ),
        'Accept-Encoding': 'gzip,deflate,br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': (
            'SMYUV=1556302359983345;ABTEST=2|1560665246|v1;IPLOC=CN4201;SUID=ADB93971771A910A000000005D05DC9E;'
            'SUID=ADB939712F20910A000000005D05DCA5;weixinIndexVisited=1;sct=1;SNUID=C9DC5D156461EE0FD7DF3DEE65B24370;JSESSIONID=aaaMCpeCkX1I86hi_hiRw'
        ),
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': (
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/74.0.3729.108 Safari/537.36'
        ),
    }
    session = Session()
    queue = RedisQueue()

    def start(self):
        """Apply the class headers to the session and enqueue the first request.

        The first request targets the search page for ``self.keyword``; it is
        created with ``self.parse_index`` as its callback and with
        ``need_proxy=True``.

        :return: None
        """
        # Update the session-wide headers.
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({
            'query': self.keyword,
            'type': 2
        })
        weixin_request = WeixinRequest(url=start_url,
                                       callback=self.parse_index,
                                       need_proxy=True)
        # Schedule the first request.
        self.queue.add(weixin_request)
Exemplo n.º 3
0
import pymongo
import threading

# --- Browser setup -------------------------------------------------------
# Chrome options pointing at an existing Windows user-data directory.
# NOTE(review): in the lines visible here `chrome_options` is never passed to
# a driver -- the browser below is created with webdriver.Firefox() and no
# options. Confirm which browser is intended.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument(
    '--user-data-dir=C:\\Users\\lx\\AppData\\Local\\Google\\Chrome\\User Data')
browser = webdriver.Firefox()
# Explicit-wait helper bound to the browser, constructed with the value 10.
wait = WebDriverWait(browser, 10)

# --- MongoDB configuration -----------------------------------------------
MONGO_URL = 'localhost'
MONGO_DB = 'wenshu'
MONGO_COLLECTION = 'wenshu'
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# --- Crawl configuration -------------------------------------------------
queue = RedisQueue()
# Search-result list URL on wenshu.court.gov.cn with percent-encoded
# `conditions` query parameters.
URL = 'http://wenshu.court.gov.cn/list/list/?sorttype=1&number=GKXHF5CE&guid=0aef5a25-a03b-3dfcb27f-7af4bb80f708&conditions=searchWord+2+AJLX++%E6%A1%88%E4%BB%B6%E7%B1%BB%E5%9E%8B:%E6%B0%91%E4%BA%8B%E6%A1%88%E4%BB%B6&conditions=searchWord+%E5%B9%BF%E4%B8%9C%E7%9C%81%E6%B7%B1%E5%9C%B3%E5%B8%82%E4%B8%AD%E7%BA%A7%E4%BA%BA%E6%B0%91%E6%B3%95%E9%99%A2+++%E4%B8%AD%E7%BA%A7%E6%B3%95%E9%99%A2:%E5%B9%BF%E4%B8%9C%E7%9C%81%E6%B7%B1%E5%9C%B3%E5%B8%82%E4%B8%AD%E7%BA%A7%E4%BA%BA%E6%B0%91%E6%B3%95%E9%99%A2'
# presumably the number of result pages to crawl -- TODO confirm against usage
PAGE = 10


def index_page():
    # 爬取页面的超链接并放入redis,并且点击下一页,(看能否使用多线程,一个爬主页,一个存信息T)
    try:
        url = 'http://wenshu.court.gov.cn/list/list/?sorttype=1&number=GKXHF5CE&guid=0aef5a25-a03b-3dfcb27f-7af4bb80f708&conditions=searchWord+2+AJLX++%E6%A1%88%E4%BB%B6%E7%B1%BB%E5%9E%8B:%E6%B0%91%E4%BA%8B%E6%A1%88%E4%BB%B6&conditions=searchWord+%E5%B9%BF%E4%B8%9C%E7%9C%81%E6%B7%B1%E5%9C%B3%E5%B8%82%E4%B8%AD%E7%BA%A7%E4%BA%BA%E6%B0%91%E6%B3%95%E9%99%A2+++%E4%B8%AD%E7%BA%A7%E6%B3%95%E9%99%A2:%E5%B9%BF%E4%B8%9C%E7%9C%81%E6%B7%B1%E5%9C%B3%E5%B8%82%E4%B8%AD%E7%BA%A7%E4%BA%BA%E6%B0%91%E6%B3%95%E9%99%A2'
        browser.get(url)
        wait.until(
            EC.text_to_be_present_in_element((
                By.CSS_SELECTOR,
                '#resultList > div:nth-child(1) > table > tbody > tr:nth-child(2) > td > div'
            ), '广东'))
        time.sleep(5)
Exemplo n.º 4
0
class Spider():
    """Queue-driven spider for Sogou's WeChat article search.

    Requests are popped from ``queue``, sent through ``session`` (via a proxy
    when the request asks for one), and the response is handed to the
    request's callback. Callbacks yield follow-up requests, which are
    re-queued, or article dicts, which are stored through ``mongo``.
    """

    count = 1  # index pages visited so far; compared against MAX_PAGE
    base_url = 'http://weixin.sogou.com/weixin'
    proxypool_url = PROXYPOOL_URL
    keyword = KEYWORD
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': (
            'ABTEST=5|1532314313|v1; SUID=B50142311F2D940A000000005B5542C9; weixinIndexVisited=1;'
            ' ppinf=5|1532326946|1533536546|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToxODolRTUlQTUlQkQlRTYl'
            'ODMlQjN8Y3J0OjEwOjE1MzIzMjY5NDZ8cmVmbmljazoxODolRTUlQTUlQkQlRTYlODMlQjN8dXNlcmlkOjQ0Om85dDJsdU1zcUd'
            'LUkZhNFhHUFZDY05WcVg4Nk1Ad2VpeGluLnNvaHUuY29tfA; pprdig=l7bPSuHEX2Dw59St_Hr2jsd9yOCiEQg-SINIWoJwCTf'
            '2NQ4D7oVXLanLnrvYbyRy_v1-ELWd_AxHVeBrAv0m6MNA_sLeRYd4rZK6oGkl7MuMcIwLsO1LNymIEDQyzrO5EKUiD6XDGKr5nS'
            'v3-FK2IT2yKUXHdv_CHpYLO507QTc; sgid=20-36194065-AVtVdCKIibiaGVBqLwI9ItDZU; UM_distinctid=164c81479f'
            '755c-0932f36bc02d29-5b193613-144000-164c81479f8640; CNZZDATA1261666818=1219770327-1532362355-%7C153'
            '2362355; IPLOC=CN3202; SUID=3E0442313108990A000000005B56963C; SUV=00771DED3142043E5B56963D70C16776;'
            ' sct=5; SNUID=142E681B2B2F5B9FDD32825E2B9C0829; JSESSIONID=aaatZ36oaO6QKh8i9qHsw; ppmdig=1532422395'
            '00000000ab72fee141c5ddb0e58074d76e7065'
        ),
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
            ' Chrome/67.0.3396.99 Safari/537.36'
        ),
    }
    session = Session()
    queue = RedisQueue()
    mysql = Mysql()
    mongo = MongoDB()

    def start(self):
        """Install the headers on the session and enqueue the first search request."""
        self.session.headers.update(self.headers)
        query_string = urlencode({
            'query': self.keyword,
            'type': 2
        })
        first_request = WeixinRequest(url=self.base_url + '?' + query_string,
                                      callback=self.parse_index,
                                      need_proxy=True)
        self.queue.add(first_request)

    def schedule(self):
        """Drain the queue: send each request and route what its callback yields.

        A request is passed to ``error`` when sending fails, when the status
        code is not 200, or when the callback yields nothing.
        """
        while not self.queue.empty():
            pending = self.queue.pop()
            handler = pending.callback
            print('Schedule', pending.url)
            reply = self.send_request(pending)

            # Failed send or non-200 status: count it as a failure.
            if not (reply and reply.status_code == 200):
                self.error(pending)
                continue

            produced = list(handler(reply))
            # The callback found nothing on the page: treat it as a bad page.
            if not produced:
                print('获得的页面不正确')
                self.error(pending)
                continue

            for outcome in produced:
                print('New Result', outcome)
                if isinstance(outcome, WeixinRequest):
                    self.queue.add(outcome)
                if isinstance(outcome, dict):
                    # MySQL storage is disabled; results go to MongoDB only.
                    self.mongo.insert(outcome)

    def send_request(self, weixin_request):
        """Send one request through the session.

        A proxy is used (with redirects disabled) when the request needs one
        and the pool returned one; otherwise the request is sent directly with
        redirects enabled.

        :param weixin_request: the request to execute
        :return: the response, or False on ConnectionError / ReadTimeout
        """
        try:
            proxy = self.get_proxy() if weixin_request.need_proxy else None
            if proxy:
                send_options = {
                    'allow_redirects': False,
                    'proxies': {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    },
                }
            else:
                # Per the original author: redirects must be allowed on this
                # path, otherwise the status code comes back as 301.
                send_options = {'allow_redirects': True}
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     **send_options)
        except (ConnectionError, ReadTimeout) as exc:
            print(exc.args)
            return False

    def get_proxy(self):
        """Fetch a proxy address from the proxy pool.

        :return: the response body on HTTP 200, otherwise None
        """
        try:
            reply = requests.get(self.proxypool_url)
            if reply.status_code != 200:
                return None
            print('Get Proxy', reply.text)
            return reply.text
        except ConnectionError:
            return None

    def parse_index(self, response):
        """Parse a search-result page.

        Yields one detail-page request per article link, then, while the page
        budget (MAX_PAGE) is not exhausted and a next-page link exists, a
        request for the next index page.
        """
        doc = pq(response.text)
        links = doc('.news-box .news-list li .txt-box h3 a').items()
        for link in links:
            yield WeixinRequest(url=link.attr('href'),
                                callback=self.parse_detail)

        next_href = doc('#sogou_next').attr('href')
        if self.count < MAX_PAGE and next_href:
            self.count += 1
            yield WeixinRequest(url=self.base_url + next_href,
                                callback=self.parse_index,
                                need_proxy=True)

    def parse_detail(self, response):
        """Parse an article page and yield a single dict describing it."""
        doc = pq(response.text)
        title = doc('.rich_media_title').text()
        # Remove every whitespace run from the article body.
        content = ''.join((doc('.rich_media_content').text()).split())
        date = doc('#post-date').text()
        nickname = doc('#js_profile_qrcode > div > strong').text()
        wechat = doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        yield {
            'title': title,
            'content': content,
            'date': date,
            'nickname': nickname,
            'wechat': wechat,
        }

    def error(self, weixin_request):
        """Record a failure and re-queue the request while under MAX_FAILED_TIME."""
        weixin_request.fail_time += 1
        print('Request Failed', weixin_request.fail_time, 'Times',
              weixin_request.url)
        if not weixin_request.fail_time < MAX_FAILED_TIME:
            return
        self.queue.add(weixin_request)
Exemplo n.º 5
0
class Spider():
    """Queue-driven spider for Sogou's WeChat article search.

    Requests are popped from ``queue``, sent through ``session`` (via a proxy
    when the request asks for one), and the response is handed to the
    request's callback. Callbacks yield follow-up requests, which are
    re-queued, or article dicts, which are stored through ``mysql``.
    """

    base_url = 'http://weixin.sogou.com/weixin'
    keyword = '世界杯'
    # NOTE(review): the Referer value contains raw non-ASCII characters; HTTP
    # header values are normally expected to be latin-1 encodable, so
    # percent-encoding the query may be needed -- confirm. Left unchanged.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
        'Cookie': 'SNUID=ABDB5DA1D1D4A1E78684A865D107AE0F; IPLOC=CN6101; SUID=7A0B8C715F20940A000000005B3D82E1; SUV=1530757856027923; ABTEST=0|1530757927|v1; weixinIndexVisited=1; sct=1; JSESSIONID=aaaEVzTuhjDDbeEJCJgrw; ppinf=5|1530758022|1531967622|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo5Ok03NzIzMjkxN3xjcnQ6MTA6MTUzMDc1ODAyMnxyZWZuaWNrOjk6TTc3MjMyOTE3fHVzZXJpZDo0NDpvOXQybHVQbHhIcjJGZEh6UWxtWTk2elNSSzdnQHdlaXhpbi5zb2h1LmNvbXw; pprdig=nllAYaYxssp0hiUDLbEvvzmxf01k-Yp_ap-DE9ySNTT_ml1urWFbceFAl3tDw8mIzO-xRANMxd1RyOjH4hBYnHTtdad7i4cMcKCToqIkuNgoVg-v8hRMUAthv-42GI5QRC3QD5j-jVdSJ26-_0xZfS2YrhmYnKXvtpItdZpUI6I; sgid=13-35854231-AVs9g4blTLWoo7vKJzNYu4g; ppmdig=15307580220000006976bdc9e221344757bb35c7bc99b93f',
        'Host': 'weixin.sogou.com',
        'Referer': 'http://weixin.sogou.com/weixin?query=世界杯&_sug_type_=&sut=6402&lkt=4%2C1530757936771%2C1530757943164&s_from=input&_sug_=y&type=2&sst0=1530757943266&page=1&ie=utf8&w=01019900&dr=1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """Fetch a random proxy from the proxy pool at PROXY_POOL_URL.

        The original author's example value for PROXY_POOL_URL is
        'http://127.0.0.1:5555/random'.

        :return: the response body on HTTP 200, otherwise None
        """
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('Get proxy', response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """Apply the class headers to the session and enqueue the first request."""
        # Update the session-wide headers.
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({'query': self.keyword, 'type': 2})
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index, need_proxy=True)
        # Schedule the first request.
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """Callback that parses a search-result (index) page.

        Yields one detail-page request per article link, followed by a request
        for the next index page when a next-page link is present.

        :param response: response whose ``text`` holds the index page HTML
        """
        doc = pq(response.text)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        # All article links on this page. `.items()` already returned the
        # iterable; the original called it again (`items()`), which raises
        # TypeError because a generator is not callable.
        for item in items:
            url = item.attr('href')
            weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
            yield weixin_request
        # Next page (local renamed so it no longer shadows the builtin `next`).
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            url = self.base_url + str(next_href)
            weixin_request = WeixinRequest(url=url, callback=self.parse_index, need_proxy=True)
            yield weixin_request

    def parse_detail(self, response):
        """Parse an article (detail) page and yield one dict describing it.

        :param response: response whose ``text`` holds the article HTML
        """
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#publish_time').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text()
        }
        yield data

    def request(self, weixin_request):
        """Send one request through the session, via a proxy when required.

        :param weixin_request: the request to execute
        :return: the response, or False on ConnectionError / ReadTimeout
        """
        try:
            # Only ask the pool for a proxy when the request needs one.
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    # Session.send() executes the prepared form of the request.
                    return self.session.send(weixin_request.prepare(), timeout=weixin_request.timeout, allow_redirects=False, proxies=proxies)
            return self.session.send(weixin_request.prepare(), timeout=weixin_request.timeout, allow_redirects=False)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """Record a failure and re-queue the request while under MAX_FAILED_TIME.

        :param weixin_request: the request that failed
        """
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Faild', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """Drain the queue: send each request and route what its callback yields.

        A response counts as valid when its status code is in VALID_STATUSES
        (the original author's example value is [200]). Invalid responses and
        empty callback results are passed to ``error``.
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            if response and response.status_code in VALID_STATUSES:
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', result)
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def run(self):
        """Entry point: seed the queue, then process it until it is empty."""
        self.start()
        self.schedule()
Exemplo n.º 6
0
class Spider():
    """Queue-driven spider for Sogou's WeChat article search.

    Requests are popped from ``queue``, sent through ``session`` (via a proxy
    when the request asks for one), and the response is handed to the
    request's callback. Callbacks yield follow-up requests, which are
    re-queued, or article dicts, which are stored through ``mysql``.
    """

    base_url = 'https://weixin.sogou.com/weixin?'
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36',
        'Host': 'weixin.sogou.com',
        'Cookie':
        'CXID=DC6BC45CC7C377F723DD3A443FE31E4C; SUID=173F49DF3565860A5D15F0D5000657EA; wuid=AAGPCPd4KAAAAAqLFBtmTQ0AGwY=; SUV=008FECF4DF4AD4815D27703DD6AB0621; ABTEST=6|1565489065|v1; IPLOC=CN4451; weixinIndexVisited=1; JSESSIONID=aaaMUF5foWevaeAF9kpXw; sct=3; ppinf=5|1566048243|1567257843|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToxMTplY2hveHh6aGFuZ3xjcnQ6MTA6MTU2NjA0ODI0M3xyZWZuaWNrOjExOmVjaG94eHpoYW5nfHVzZXJpZDo0NDpvOXQybHVFa1JhOHg0VlN1akEyVWFfWm5jLUlRQHdlaXhpbi5zb2h1LmNvbXw; pprdig=DBBFR2RZZygrUSj6gi2wMUB1mVAlkg1-pTbbkU6YY8rwyrLS6hOyRYi6q8XatVyTLI17Yow3q-RsViuhloTsy7OAJSg2B0PDjiWmSpT53CWn12TzQvcHyhBz9CboxgT-HyjGlyaHZXc5_nX2IY5O8daePYZ5OTh_8FUwTbElygg; sgid=29-42669099-AV1XicicOCeeX2kdrUcM6AiaTo; ppmdig=15660482440000000132fe6a9586fc74c7d7bd1e47980e9d; PHPSESSID=m6o5qn0mr83kplu2nbhu20i2j0; SNUID=3EBF21B46A6EFB3A61BBFA7F6BB0F6F8; successCount=1|Sat, 17 Aug 2019 13:30:39 GMT',
        'Connection': 'keep-alive'
    }
    session = requests.Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """Fetch a random proxy from the proxy pool at PROXY_POOL_URL.

        :return: the response body on HTTP 200; None on any other status or
            on a connection error
        """
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('获取到代理', response.text)
                return response.text
        except requests.ConnectionError:
            print('出错了')
            return None

    def start(self):
        """Apply the class headers to the session and enqueue the first request.

        The query parameters come from ``the_dict``, which is defined outside
        this class.
        """
        self.session.headers.update(self.headers)  # install the headers on the session
        start_url = self.base_url + urlencode(the_dict)  # build the start URL
        weixin_request = WeixinRequest(url=start_url,
                                       callback=self.parse_index,
                                       need_proxy=True)

        # Add the request object to the queue.
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """Parse a search-result (index) page.

        Yields one detail-page request per article link, followed by a request
        for the next index page when a next-page link is present.

        :param response: response whose ``text`` holds the index page HTML
        :return: generator of new requests
        """
        doc = pq(response.text)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
            # Link from the list page to an article detail page.
            yield weixin_request
        next = doc('#sogou_next').attr('href')
        if next:
            # NOTE(review): base_url already ends with '?'; if the href also
            # starts with '?', this produces '??' -- confirm the href format.
            url = self.base_url + str(next)  # build the next-page URL
            weixin_request = WeixinRequest(url=url,
                                           callback=self.parse_index,
                                           need_proxy=True)
            # Request for the next index page.
            yield weixin_request

    def parse_detail(self, response):
        """Parse an article (detail) page and yield one dict describing it.

        :param response: response whose ``text`` holds the article HTML
        :return: generator yielding the article dict
        """
        doc = pq(response.text)
        data = {
            'title':
            doc('.rich_media_title').text(),
            'content':
            doc('.rich_media_content').text(),
            'date':
            doc('#post-date').text(),
            'nickname':
            doc('#js_profile_qrcode > div > strong').text(),
            'wechat':
            doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }
        yield data

    def request(self, weixin_request):
        """Send one request through the session, via a proxy when required.

        :param weixin_request: the request to execute
        :return: the response, or False on ConnectionError / ReadTimeout
        """

        try:
            if weixin_request.need_proxy:
                # The request was created with need_proxy set, so try to
                # obtain a proxy from the pool first.
                proxy = self.get_proxy()
                if proxy:
                    # Keys are URL schemes. The original used 'http:' (with a
                    # stray colon), which never matches the 'http' scheme, so
                    # the proxy was silently skipped for http URLs.
                    proxies = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False,
                                             proxies=proxies)

            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=False)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """Record a failure and re-queue the request while under MAX_FAILED_TIME.

        :param weixin_request: the request that failed
        :return: None
        """
        # Increment the failure counter.
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Failed', weixin_request.fail_time, 'Times',
              weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            # Still below the retry limit: put the request back on the queue.
            self.queue.add(weixin_request)

    def schedule(self):
        """Drain the queue: send each request and route what its callback yields.

        Responses whose status code is not in VALID_STATUSES (or failed sends)
        are passed to ``error``. An empty callback result is ignored.

        :return: None
        """
        while not self.queue.empty():  # keep going while the queue has items
            weixin_request = self.queue.pop()  # take one request
            # The callback is the parse method attached to the request.
            callback = weixin_request.callback

            print(weixin_request.url)
            response = self.request(weixin_request)
            print(response)
            if response and response.status_code in VALID_STATUSES:
                # Run the parser and collect everything it yields.
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', type(result))
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
            else:
                # No usable response: record the failure.
                self.error(weixin_request)

    def run(self):
        """Entry point: seed the queue, then process it until it is empty.

        :return: None
        """
        self.start()
        self.schedule()
Exemplo n.º 7
0
class Wechat(object):
    """Queue-driven spider for Sogou's WeChat article search.

    Requests are popped from ``queue``, sent through ``session`` (via a proxy
    when the request asks for one), and the response is handed to the
    request's callback. Callbacks yield follow-up requests, which are
    re-queued, or article dicts, which are stored through ``mysql``.
    """

    base_url = "http://weixin.sogou.com/weixin"
    keyword = "nba"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15',
        "Cookie": 'sct=2; ppmdig=153798156900000029b2c90dd10104a88b02afad382cf017; ppinf=5|1537799457|1539009057|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToxMTptcmxvbmVseWp0cnxjcnQ6MTA6MTUzNzc5OTQ1N3xyZWZuaWNrOjExOm1ybG9uZWx5anRyfHVzZXJpZDo0NDpvOXQybHVGUEpyZHBxQ0dWR09VdG4xb1ZveU5nQHdlaXhpbi5zb2h1LmNvbXw; pprdig=YrgiDGJBorgcLoekzZXOYzQGDDiQ1I06___HgQ82ouWk1pkxaD2ON7U0nMVTZ7cKn6QkbWSGMo2frdWi81FXGx76xMXLPID5wg-hMXm2x9GKImc75S2POjyaM1ybQGA8ICZmUgUcOJt2hIfQCPulkQjE2rGEOPp6zbBzPCvXinE; sgid=02-35144799-AVuo9SEdkhQRYFdDtzBL72k; SNUID=3A94A629F3F6856D4D4A75AAF413208B; JSESSIONID=aaadNfym5dl1OgUejcHvw; SUV=006425A0DA5267C95BA8F38BC42D4423; IPLOC=CN3100; SUID=C96752DA2C12960A000000005BA8F38A; SUID=C96752DA3F18960A000000005BA8F389; ABTEST=0|1537799049|v1; weixinIndexVisited=1',
        "Host": 'weixin.sogou.com',
        # Header values must be strings; the original used the int 1 here.
        "Upgrade-Insecure-Requests": '1',
        "Accept": 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        "Accept-Encoding": 'gzip, deflate',
        "Accept-Language": 'zh-CN,zh;q=0.9',
        "Cache-Control": 'no-cache',
        "Connection": 'keep-alive'
    }

    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """Fetch a proxy from the proxy pool at PROXY_POOL_URL.

        :return: the response body on HTTP 200, otherwise None
        """
        try:
            response = requests.get(PROXY_POOL_URL)

            if response.status_code == 200:
                print("Get Proxy", response.text)
                return response.text
            return None
        except ConnectionError:
            return None

    def start(self):
        """Apply the class headers to the session and enqueue the first request."""
        self.session.headers.update(self.headers)
        start_url = self.base_url + "?" + urlencode({"query": self.keyword, "type": 2})
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index, need_proxy=True)
        self.queue.add(weixin_request)

    def schedule(self):
        """Drain the queue: send each request and route what its callback yields.

        Invalid responses (failed send, or status not in VALID_STATUSES) and
        empty callback results are passed to ``error``. stdout is flushed
        after every request.
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print("Schedule", weixin_request.url)
            response = self.request(weixin_request)

            if response and response.status_code in VALID_STATUSES:
                results = list(callback(response))

                if results:
                    for result in results:
                        print("New Result", type(result))

                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert("articles", result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

            sys.stdout.flush()

    def request(self, weixin_request):
        """Send one request through the session, via a proxy when required.

        :param weixin_request: the request to execute
        :return: the response, or False on ConnectionError / ReadTimeout
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()

                if proxy:
                    proxies = {
                        "http": "http://" + proxy,
                        "https": "https://" + proxy
                    }
                    return self.session.send(weixin_request.prepare(), timeout=weixin_request.timeout, proxies=proxies)
            return self.session.send(weixin_request.prepare(), timeout=weixin_request.timeout)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def parse_index(self, response):
        """Parse a search-result (index) page.

        Yields one detail-page request per article link, followed by a request
        for the next index page when a next-page link is present.

        :param response: response whose ``text`` holds the index page HTML
        """
        doc = pq(response.text)
        items = doc(".news-list .txt-box h3 a").items()

        for item in items:
            url = item.attr("href")
            yield WeixinRequest(url=url, callback=self.parse_detail)

        page = doc("#sogou_next").attr("href")

        if page:
            url = self.base_url + str(page)
            yield WeixinRequest(url=url, callback=self.parse_index, need_proxy=True)

    def parse_detail(self, response):
        """Parse an article (detail) page and yield one dict describing it.

        The date is taken from a ``publish_time = "..."`` assignment in the
        page source; when the page has none, the date is an empty string
        instead of raising AttributeError on the missing match.

        :param response: response whose ``text`` holds the article HTML
        """
        doc = pq(response.text)
        date_match = re.search('publish_time = "(.*?)"', response.text)
        yield {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': date_match.group(1) if date_match else '',
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }

    def error(self, weixin_request):
        """Record a failure and re-queue the request while under MAX_FAILED_TIME.

        :param weixin_request: the request that failed
        """
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)

        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def run(self):
        """Entry point: seed the queue, then process it until it is empty."""
        self.start()
        self.schedule()
Exemplo n.º 8
0
class Spider():
    """Queue-driven spider for Sogou's WeChat article search.

    Requests are popped from ``queue``, sent through ``session`` (via a proxy
    when the request asks for one), and the response is handed to the
    request's callback. Callbacks yield follow-up requests, which are
    re-queued, or article dicts, which are stored through ``mysql``.
    """

    # Search URL that already carries a query string ending in 'query=';
    # the keyword is appended directly to it in start().
    base_url = 'http://weixin.sogou.com/weixin?type=2&s_from=input&query='
    keyword = 'NBA'
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.9',
        'Cache-Control':
        'max-age=0',
        'Connection':
        'keep-alive',
        'Cookie':
        'CXID=D4E8EAA2564D90FBC285BAECF3078601; SUID=AEF72BAB5E68860A5B25128E00099457; SUV=00A00B5AAB2BF7AE5B2512C0E549F141; ad=Plllllllll2bJS3nlllllV7f1w9lllll$hsTrkllll9llllllZlll5@@@@@@@@@@; IPLOC=CN4403; ABTEST=0|1532415249|v1; SNUID=ABF409F180850E3945E004EF802B5D16; weixinIndexVisited=1; JSESSIONID=aaav6mr8OopuEawYdpHsw; ppinf=5|1532422896|1533632496|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTozOk5HVXxjcnQ6MTA6MTUzMjQyMjg5NnxyZWZuaWNrOjM6TkdVfHVzZXJpZDo0NDpvOXQybHVHN1RDWjZBU0E0TjgzM2JvaGs5aGJFQHdlaXhpbi5zb2h1LmNvbXw; pprdig=UAPBSEKKv9ua27yywPBP0BKxd4FAEtELVT8yK7dxy7N57B3yS-PA3M-C3d-VEOBxc-N-IIRP7khJM3Amnnol_WBt5RTD-V0pgVuxqNVf0EqfwLJwDWkiI3OA0-rCrBJrdOnCK0vj3IZvheDE1yjLjv-mdw0tv5MSeqlFOWZyhPk; sgid=22-36203267-AVtW6vAowj2ViaOuOVGM5Mto; ppmdig=153242289600000060a71b68c3ad711211acdaf46fde1281',
        'Host':
        'weixin.sogou.com',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """Fetch a proxy from the proxy pool at PROXY_POOL_URL.

        :return: the response body on HTTP 200, otherwise None
        """
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('Get Proxy', response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """Apply the class headers to the session and enqueue the first request."""
        # Update the session-wide headers.
        self.session.headers.update(self.headers)
        start_url = self.base_url + self.keyword
        weixin_request = WeixinRequest(url=start_url,
                                       callback=self.parse_index,
                                       need_proxy=True)
        # Schedule the first request.
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """Parse a search-result (index) page.

        Yields one detail-page request per article link, followed by a request
        for the next index page when a next-page link is present.

        :param response: response whose ``text`` holds the index page HTML
        :return: generator of new requests
        """
        doc = pq(response.text)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
            yield weixin_request
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            # base_url ends with '...&query=', so appending the href to it
            # (as the original did) turned the href into the search keyword.
            # Join the href onto the path part of base_url only.
            # Assumes the href is a query string starting with '?', as the
            # original concatenation did -- TODO confirm against the page.
            url = self.base_url.split('?', 1)[0] + str(next_href)
            weixin_request = WeixinRequest(url=url,
                                           callback=self.parse_index,
                                           need_proxy=True)
            yield weixin_request

    def parse_detail(self, response):
        """Parse an article (detail) page and yield one dict describing it.

        :param response: response whose ``text`` holds the article HTML
        :return: generator yielding the article dict
        """
        doc = pq(response.text)
        # The 'date' field (selector '#post-date') was disabled by the
        # original author and is not collected.
        data = {
            'title':
            doc('.rich_media_title').text(),
            'content':
            doc('.rich_media_content').text(),
            'nickname':
            doc('#js_profile_qrcode > div > strong').text(),
            'wechat':
            doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }
        yield data

    def request(self, weixin_request):
        """Send one request through the session, via a proxy when required.

        :param weixin_request: the request to execute
        :return: the response, or False on ConnectionError / ReadTimeout
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False,
                                             proxies=proxies)
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=False)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """Record a failure and re-queue the request while under MAX_FAILED_TIME.

        :param weixin_request: the request that failed
        :return: None
        """
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Failed', weixin_request.fail_time, 'Times',
              weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """Drain the queue: send each request and route what its callback yields.

        Invalid responses (failed send, or status not in VALID_STATUSES) and
        empty callback results are passed to ``error``.

        :return: None
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            if response and response.status_code in VALID_STATUSES:
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', type(result))
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def run(self):
        """Entry point: seed the queue, then process it until it is empty.

        :return: None
        """
        self.start()
        self.schedule()
Exemplo n.º 9
0
class Spider():
    """
    Queue-driven crawler for Sogou's WeChat article search.

    ``start`` enqueues the first search request; ``schedule`` pops requests
    from ``queue``, sends them through the shared ``session`` and hands each
    valid response to the request's callback. Callbacks yield
    ``WeixinRequest`` objects (added back to the queue) or ``dict`` records
    (passed to ``mysql.insert('articles', ...)``).
    """
    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # Browser-style headers (including a session cookie) that ``start`` copies
    # onto the shared session.
    # NOTE(review): the cookie is hard-coded; presumably it expires and has to
    # be refreshed by hand.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'SUID=329D9B273865860A598BD206000E05BC; ABTEST=0|1536152449|v1; IPLOC=CN1100; weixinIndexVisited=1; SUV=00334F7972F4B9F35B8FD385513C8523; SNUID=9111B04D3832425DCA568C3938B9B969; sct=2; JSESSIONID=aaaJrY-Zle4RBb7vUTBvw',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = Mysql()

    def get_proxy(self):
        '''
        Fetch a random proxy from the local proxy-pool service.

        :return: the response body on HTTP 200 (presumably ``host:port`` --
            confirm against the pool service); ``None`` on any other status
            or when the service cannot be reached
        '''
        proxy_url = 'http://127.0.0.1:8080/random'
        try:
            response = requests.get(proxy_url)
            if response.status_code == 200:
                print("GET Proxy:",response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        '''
        Apply the class headers to the session and enqueue the first search
        request.

        :return: None
        '''
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({'type':'2','query':self.keyword})
        weixin_request = WeixinRequest(start_url,self.parse_index,need_proxy=True)
        self.queue.add(weixin_request)

    def parse_index(self,response):
        '''
        Parse a search-result (index) page.

        :param response: response of an index-page request
        :return: generator of ``WeixinRequest`` objects, one per article link
        '''
        doc = pq(response.text)
        # '.txt-box' is a class selector; without the leading dot it is parsed
        # as a <txt-box> element and matches nothing, so no article link was
        # ever yielded.
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            weixin_request = WeixinRequest(url=url,callback=self.parse_detail)
            yield weixin_request
        # Pagination is currently disabled (left commented out):
        # next = doc('#sogou_next')
        # if next:
        #     url = self.base_url + next.attr('href')
        #     weixin_request = WeixinRequest(url=url,callback=self.parse_index,need_proxy=True)
        #     yield weixin_request

    def parse_detail(self,response):
        '''
        Parse an article (detail) page.

        :param response: response of a detail-page request
        :return: generator yielding one ``dict`` with the keys ``title``,
            ``date``, ``nickname`` and ``wechat``
        '''
        doc = pq(response.text)
        data = {
            'title':doc('.rich_media_title').text(),
            # Article body is currently not collected:
            # 'content':doc('.rich_media_content').text(),
            'date':doc('.rich_media_meta_list #publish_time').text(),
            'nickname':doc('.rich_media_meta_list .rich_media_meta_text').text(),
            'wechat':doc('.rich_media_meta_list .rich_media_meta_nickname').text()
        }
        print(data)
        yield data

    def request(self,weixin_request):
        '''
        Send a request through the shared session, optionally via a proxy.

        :param weixin_request: request to send; ``need_proxy`` decides whether
            a proxy is fetched, ``timeout`` is forwarded to the session
        :return: the response, or ``False`` when the request raised a
            ``requests`` exception (connection error, timeout, ...)
        '''
        # The network call is guarded so that a single failed request is
        # reported to ``schedule`` as a falsy response (and retried through
        # ``error``) instead of aborting the whole crawl.
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http':'http://' + proxy,
                        'https':'http://'+proxy
                    }
                    # NOTE: verify=False disables TLS certificate verification.
                    return self.session.send(weixin_request.prepare(),timeout=weixin_request.timeout,
                                             allow_redirects=False,proxies=proxies,verify=False)
            # No proxy requested, or none available: send directly.
            return self.session.send(weixin_request.prepare(),timeout=weixin_request.timeout,
                                     allow_redirects=False,verify=False)
        except requests.RequestException as e:
            print(e.args)
            return False

    def error(self,weixin_request):
        '''
        Count a failed attempt and re-queue the request while its failure
        count is below ``MAX_FAILED_TIME``.

        :param weixin_request: the request that failed
        :return: None
        '''
        weixin_request.fail_time += 1
        print('Request Failed',weixin_request.fail_time,'Times',weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        '''
        Process queued requests until the queue is empty.

        A request is passed to ``error`` when its response is falsy, has a
        status outside ``VALID_STATUSES``, or its callback yields nothing.

        :return: None
        '''
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print("Schedule",weixin_request.url)
            response = self.request(weixin_request)
            print(response)
            if response and response.status_code in VALID_STATUSES:
                # Materialise the generator so an empty result is detectable.
                results = list(callback(response))
                if results:
                    for result in results:
                        if isinstance(result,WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result,dict):
                            self.mysql.insert('articles',result)
                else:
                    self.error(weixin_request)
                    print(response)
            else:
                self.error(weixin_request)

    def run(self):
        '''
        Entry point: seed the queue, then process it.

        :return: None
        '''
        self.start()
        self.schedule()
Exemplo n.º 10
0
class Spider:
    """
    Queue-driven crawler for the tiam.jp post ranking.

    ``start`` enqueues the ranking page; ``schedule`` pops requests from
    ``queue``, sends them through the shared ``session`` and passes each valid
    response to the request's callback. Callbacks yield ``WeixinRequest``
    objects (queued again) or ``dict`` records (inserted into the ``tiam``
    table).
    """
    base_url = "https://tiam.jp/post_ranking"
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()
    # Clear any previously queued requests. This is a class-body statement, so
    # it runs once, when the class is defined.
    queue.clear()

    def get_proxy(self):
        """
        Fetch a proxy from the proxy pool.

        :return: the response body on HTTP 200; ``None`` on any other status
            or when the pool cannot be reached
        """
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print("Get Proxy", response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """
        Enqueue the first (ranking page) request, sent without a proxy.

        :return: None
        """
        start_url = self.base_url
        print(start_url)
        weixin_request = WeixinRequest(url=start_url,
                                       callback=self.parse_index,
                                       need_proxy=False)
        # Schedule the first request.
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """
        Parse a ranking (index) page.

        :param response: response of an index-page request
        :return: generator of ``WeixinRequest`` objects: one per article link,
            plus one for the next page when a pager link is present
        """
        doc = pq(response.text)
        items = doc('.row.hsd-article').items()
        for item in items:
            url = item('a').attr('href')
            # NOTE(review): the original comment here said detail pages do
            # NOT need a proxy, yet need_proxy=True is passed -- confirm which
            # one is intended.
            weixin_request = WeixinRequest(url=url,
                                           callback=self.parse_detail,
                                           need_proxy=True)
            yield weixin_request
        # Renamed from ``next`` to avoid shadowing the builtin.
        next_href = doc('.pager_right a').attr('href')
        if next_href:
            weixin_request = WeixinRequest(
                url=next_href,
                callback=self.parse_index,
                need_proxy=True  # list pages are requested through a proxy
            )
            yield weixin_request

    def parse_detail(self, response):
        """
        Parse an article (detail) page.

        Missing elements produce empty values instead of raising, so one
        malformed page cannot abort the scheduler loop.

        :param response: response of a detail-page request
        :return: generator yielding one ``dict`` describing the article
        """
        doc = pq(response.text)
        category = doc('div.thin_margin').text()
        # The first two space-separated tokens of '.date.low' are used as
        # click count and release date; a missing second token yields ''
        # rather than an IndexError.
        date_parts = (doc('.date.low').text() or '').split(' ')
        release_date = date_parts[1] if len(date_parts) > 1 else ''
        data = {
            'actor': (category or '').split(' ')[0],
            'title': (doc('.hsd-article-title').text() or '').strip(),
            'video_url': doc('.hsd-video').attr('src'),
            'click_times': date_parts[0],
            'release_date': release_date,
            'category': category,
            'image_url': doc('.hsd-product-detail-img').attr('src'),
            'info': doc('.hsd-lead-info').text(),
        }
        yield data

    def request(self, weixin_request):
        """
        Send a request through the shared session, optionally via a proxy.

        :param weixin_request: request to send; ``need_proxy`` decides whether
            a proxy is fetched, ``timeout`` is forwarded to the session
        :return: the response, or ``False`` on a connection error or read
            timeout
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        "http": "http://" + proxy,
                        "https": "https://" + proxy
                    }
                    return self.session.send(
                        weixin_request.prepare(),
                        timeout=weixin_request.timeout,
                        allow_redirects=False,
                        proxies=proxies,
                    )
            # No proxy requested, or none available: send directly.
            return self.session.send(
                weixin_request.prepare(),
                timeout=weixin_request.timeout,
                allow_redirects=False,
            )
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """
        Count a failed attempt and re-queue the request while its failure
        count is below ``MAX_FAILED_TIME``.

        :param weixin_request: the request that failed
        :return: None
        """
        weixin_request.fail_time = weixin_request.fail_time + 1
        print("Request Failed", weixin_request.fail_time, "Times",
              weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """
        Process queued requests until the queue is empty.

        A request is passed to ``error`` when its response is falsy, has a
        status outside ``VALID_STATUSES``, or its callback yields nothing.

        :return: None
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print("Schedule", weixin_request.url)

            response = self.request(weixin_request)
            if response and response.status_code in VALID_STATUSES:
                # Materialise the generator so an empty result is detectable.
                results = list(callback(response))
                if results:
                    for result in results:
                        print("New Result", type(result))
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert("tiam", result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def run(self):
        """
        Entry point: seed the queue, then process it.

        :return: None
        """
        self.start()

        self.schedule()
Exemplo n.º 11
0
class Spider():
    """
    Crawler for Sogou's WeChat article search, driven by a request queue.

    ``start`` seeds the queue; ``schedule`` pops requests, sends them and
    dispatches each valid response to the request's callback. Callbacks yield
    further ``WeixinRequest`` objects or ``dict`` records for the
    ``articles`` table.
    """
    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # Headers that ``start`` copies onto the shared session.
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Cache-Control':
        'max-age=0',
        'Connection':
        'keep-alive',
        'Cookie':
        'IPLOC=CN2102; SUID=9C73756F1F13940A000000005B1F2906; SUV=00F0227D6F75739C5B1F290662C32918; ABTEST=2|1529889370|v1; SNUID=345679DA0F0A7F62492989F810AAD276; weixinIndexVisited=1; ld=Jyllllllll2bG93illlllV7Zyr6lllllhXBpJlllll9lllllpylll5@@@@@@@@@@; LSTMV=0%2C0; LCLKINT=227; ppinf=5|1529891807|1531101407|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo5OiVFNyVBRCVCMXxjcnQ6MTA6MTUyOTg5MTgwN3xyZWZuaWNrOjk6JUU3JUFEJUIxfHVzZXJpZDo0NDpvOXQybHVHN2YxNzhjV2RRTS1FeFhqd0pTQnF3QHdlaXhpbi5zb2h1LmNvbXw; pprdig=mFU0I21BkcfTms9E36aUuv3z00DUZRBsL1tvbKyCkCzCpVhieM2YG-Q-zCFrX0Yp0hRzOMcLitg5lbi5rl4TfZb-voi6cug7on20l2thl5uYMoEwH_5NS9dEKUb4gYsW53BhZMc8g1Ceo2qR1mLpHzppFvMPZcnYsiqjTN6n4Ok; sgid=07-33580423-AVswS994vPgzNvLqqhTK3dQ; SUIR=CFAD9D26ECE98481C86D510FEC971D63; sct=4; ppmdig=1530100773000000c8ea0ae361dd1b6b84de1fec39f294b0;ad=nk91vlllll2b3z6OlllllV7ZyrolllllhXBpIkllll9lllllpylll5@@@@@@@@@@,CXID=26EC71D83938ABA9225220155EDA075A,ad=nk91vlllll2b3z6OlllllV7ZyrolllllhXBpIkllll9lllllpylll5@@@@@@@@@@, GOTO=Af99047,usid=ESlNvFQz9-gunhZe',
        'Host':
        'weixin.sogou.com',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """
        Ask the proxy pool for a proxy.

        :return: the response body on HTTP 200; ``None`` on any other status
            or when the pool cannot be reached
        """
        try:
            response = requests.get(PROXY_POOL_URL)
        except requests.ConnectionError:
            return None
        if response.status_code != 200:
            return None
        proxy = response.text
        print('Get Proxy', proxy)
        return proxy

    def start(self):
        """
        Apply the class headers to the session and enqueue the first search
        request.
        """
        # Install the headers on the session so every request carries them.
        self.session.headers.update(self.headers)
        query = urlencode({
            'query': self.keyword,
            'type': 2
        })
        first_request = WeixinRequest(url=self.base_url + '?' + query,
                                      callback=self.parse_index,
                                      need_proxy=True)
        # Schedule the first request.
        self.queue.add(first_request)

    def parse_index(self, response):
        """
        Parse a search-result (index) page.

        :param response: response of an index-page request
        :return: generator of ``WeixinRequest`` objects: one per article link,
            plus one for the next page when a next-page link is present
        """
        doc = pq(response.text)
        for link in doc('.news-box .news-list li .txt-box h3 a').items():
            yield WeixinRequest(url=link.attr('href'),
                                callback=self.parse_detail)
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            yield WeixinRequest(url=self.base_url + str(next_href),
                                callback=self.parse_index,
                                need_proxy=True)

    def parse_detail(self, response):
        """
        Parse an article (detail) page.

        :param response: response of a detail-page request
        :return: generator yielding one ``dict`` with the keys ``title``,
            ``content``, ``date``, ``nickname`` and ``wechat``
        """
        doc = pq(response.text)
        yield {
            'title':
            doc('.rich_media_title').text(),
            'content':
            doc('.rich_media_content').text(),
            'date':
            doc('#post-date').text(),
            'nickname':
            doc('#js_profile_qrcode > div > strong').text(),
            'wechat':
            doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }

    def request(self, weixin_request):
        """
        Send a request through the shared session, optionally via a proxy.

        :param weixin_request: request to send; ``need_proxy`` decides whether
            a proxy is fetched, ``timeout`` is forwarded to the session
        :return: the response, or ``False`` on a connection error or read
            timeout
        """
        try:
            proxy = self.get_proxy() if weixin_request.need_proxy else None
            if not proxy:
                # No proxy requested, or none available: send directly.
                return self.session.send(weixin_request.prepare(),
                                         timeout=weixin_request.timeout,
                                         allow_redirects=False)
            proxies = {
                'http': 'http://' + proxy,
                'https': 'https://' + proxy
            }
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=False,
                                     proxies=proxies)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """
        Count a failed attempt and re-queue the request while its failure
        count is below ``MAX_FAILED_TIME``.

        :param weixin_request: the request that failed
        :return: None
        """
        failures = weixin_request.fail_time + 1
        weixin_request.fail_time = failures
        print('Request Failed', failures, 'Times', weixin_request.url)
        if failures < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """
        Process queued requests until the queue is empty.

        A request is passed to ``error`` when its response is falsy, has a
        status outside ``VALID_STATUSES``, or its callback yields nothing.

        :return: None
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)

            if not (response and response.status_code in VALID_STATUSES):
                self.error(weixin_request)
                continue

            # Materialise the generator so an empty result is detectable.
            results = list(callback(response))
            if not results:
                self.error(weixin_request)
                continue

            for result in results:
                print('New Result', type(result))
                if isinstance(result, WeixinRequest):
                    self.queue.add(result)
                if isinstance(result, dict):
                    self.mysql.insert('articles', result)

    def run(self):
        """
        Entry point: seed the queue, then process it.

        :return: None
        """
        self.start()
        self.schedule()
Exemplo n.º 12
0
class Spider():
    """
    Crawler for Sogou's WeChat article search, driven by a request queue.

    ``start`` seeds the queue; ``schedule`` pops requests, sends them and
    dispatches each valid response to the request's callback. Callbacks yield
    further ``WeixinRequest`` objects or ``dict`` records for the
    ``articles`` table.
    """
    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # Headers that ``start`` copies onto the shared session.
    # NOTE(review): the cookie is hard-coded; presumably it expires and has to
    # be refreshed by hand.
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Cache-Control':
        'max-age=0',
        'Connection':
        'keep-alive',
        'Cookie':
        'IPLOC=CN1100; SUID=6FEDCF3C541C940A000000005968CF55; SUV=1500041046435211; ABTEST=0|1500041048|v1; SNUID=CEA85AE02A2F7E6EAFF9C1FE2ABEBE6F; weixinIndexVisited=1; JSESSIONID=aaar_m7LEIW-jg_gikPZv; ld=Wkllllllll2BzGMVlllllVOo8cUlllll5G@HbZllll9lllllRklll5@@@@@@@@@@; LSTMV=212%2C350; LCLKINT=4650; ppinf=5|1500042908|1501252508|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo1NDolRTUlQjQlOTQlRTUlQkElODYlRTYlODklOEQlRTQlQjglQTglRTklOUQlOTklRTglQTclODV8Y3J0OjEwOjE1MDAwNDI5MDh8cmVmbmljazo1NDolRTUlQjQlOTQlRTUlQkElODYlRTYlODklOEQlRTQlQjglQTglRTklOUQlOTklRTglQTclODV8dXNlcmlkOjQ0Om85dDJsdUJfZWVYOGRqSjRKN0xhNlBta0RJODRAd2VpeGluLnNvaHUuY29tfA; pprdig=ppyIobo4mP_ZElYXXmRTeo2q9iFgeoQ87PshihQfB2nvgsCz4FdOf-kirUuntLHKTQbgRuXdwQWT6qW-CY_ax5VDgDEdeZR7I2eIDprve43ou5ZvR0tDBlqrPNJvC0yGhQ2dZI3RqOQ3y1VialHsFnmTiHTv7TWxjliTSZJI_Bc; sgid=27-27790591-AVlo1pzPiad6EVQdGDbmwnvM; PHPSESSID=mkp3erf0uqe9ugjg8os7v1e957; SUIR=CEA85AE02A2F7E6EAFF9C1FE2ABEBE6F; sct=11; ppmdig=1500046378000000b7527c423df68abb627d67a0666fdcee; successCount=1|Fri, 14 Jul 2017 15:38:07 GMT',
        'Host':
        'weixin.sogou.com',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """
        Fetch a proxy from the proxy pool.

        :return: the response body on HTTP 200; ``None`` on any other status
            or when the pool cannot be reached
        """
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print("Get Proxy", response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """
        Apply the class headers to the session and enqueue the first search
        request.

        :return: None
        """
        # Install the headers on the session so every request carries them.
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({
            'query': self.keyword,
            'type': 2
        })
        weixin_request = WeixinRequest(url=start_url,
                                       callback=self.parse_index,
                                       need_proxy=True)
        # Schedule the first request.
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """
        Parse a search-result (index) page.

        :param response: response of an index-page request
        :return: generator of ``WeixinRequest`` objects: one per article link,
            plus one for the next page when a next-page link is present
        """
        doc = pq(response.text)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
            yield weixin_request
        # Renamed from ``next`` to avoid shadowing the builtin.
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            url = self.base_url + str(next_href)
            weixin_request = WeixinRequest(url=url,
                                           callback=self.parse_index,
                                           need_proxy=True)
            yield weixin_request

    def parse_detail(self, response):
        """
        Parse an article (detail) page.

        :param response: response of a detail-page request
        :return: generator yielding one ``dict`` with the keys ``title``,
            ``content``, ``date``, ``nickname`` and ``wechat``
        """
        doc = pq(response.text)
        data = {
            'title':
            doc('.rich_media_title').text(),
            'content':
            doc('.rich_media_content').text(),
            'date':
            doc('#post-date').text(),
            # ``js_profile_qrcode`` is addressed by id, as in the ``wechat``
            # selector below; the previous class selector
            # ('.js_profile_qrcode') did not agree with it.
            'nickname':
            doc('#js_profile_qrcode > div > strong').text(),
            'wechat':
            doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }
        yield data

    def request(self, weixin_request):
        """
        Send a request through the shared session, optionally via a proxy.

        :param weixin_request: request to send; ``need_proxy`` decides whether
            a proxy is fetched, ``timeout`` is forwarded to the session
        :return: the response, or ``False`` on a connection error or read
            timeout
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False,
                                             proxies=proxies)
            # Direct send. This must sit outside the ``need_proxy`` branch:
            # nested inside it, requests that do not need a proxy fell through
            # and returned None, so they were always treated as failures.
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=False)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """
        Count a failed attempt and re-queue the request while its failure
        count is below ``MAX_FAILED_TIME``.

        :param weixin_request: the request that failed
        :return: None
        """
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Failed', weixin_request.fail_time, 'Times',
              weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """
        Process queued requests until the queue is empty.

        A request is passed to ``error`` when its response is falsy, has a
        status outside ``VALID_STATUSES``, or its callback yields nothing.

        :return: None
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            if response and response.status_code in VALID_STATUSES:
                # Materialise the generator so an empty result is detectable.
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', type(result))
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def run(self):
        """
        Entry point: seed the queue, then process it.

        :return: None
        """
        self.start()
        self.schedule()
Exemplo n.º 13
0
class Spider:
    """
    Crawler for Sogou's WeChat article search, driven by a request queue.

    ``start`` seeds the queue; ``schedule`` pops requests, sends them and
    dispatches each valid response to the request's callback. Callbacks yield
    further ``WeixinRequest`` objects or ``dict`` records for the
    ``articles`` table. Failed requests are logged but not retried.
    """
    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # Headers that ``start`` copies onto the shared session.
    # NOTE(review): 'Host' is fixed to mp.weixin.qq.com for every request,
    # including those sent to ``base_url`` on weixin.sogou.com -- confirm this
    # is intended.
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate, br',
        'Accept-Language':
        'zh-CN,zh;q=0.9',
        'Cache-Control':
        'no-cache',
        'Connection':
        'keep-alive',
        'Cookie':
        'SUID=089C27B72423910A00000000587EF70E; SUV=00BA7A577157A2CD587EF716A8C08112; SUID=089C27B73020910A00000000587EF70E; CXID=2A1D8FB008EF5B378769F33E6517BC05; ABTEST=1|1543566907|v1; IPLOC=CN4403; JSESSIONID=aaaMubXXxB7JFgNDnA6Cw; weixinIndexVisited=1; sct=1; SNUID=E4EC78F24540395C82F3ABDD4616E1A3',
        'Host':
        'mp.weixin.qq.com',
        'Pragma':
        'no-cache',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """
        Ask the proxy pool for a proxy.

        :return: the response body on HTTP 200; ``None`` on any other status
            or when the pool cannot be reached
        """
        try:
            response = requests.get(PROXY_POOL_URL)
        except requests.ConnectionError:
            return None
        if response.status_code != 200:
            return None
        proxy = response.text
        print('Get Proxy', proxy)
        return proxy

    def start(self):
        """
        Apply the class headers to the session and enqueue the first search
        request.
        """
        # Install the headers on the session so every request carries them.
        self.session.headers.update(self.headers)
        query = urlencode({
            'query': self.keyword,
            'type': 2
        })
        first_request = WeixinRequest(url=self.base_url + '?' + query,
                                      callback=self.parse_index,
                                      need_proxy=True)
        # Schedule the first request.
        self.queue.add(first_request)

    def parse_index(self, response):
        """
        Parse a search-result (index) page.

        :param response: response of an index-page request
        :return: generator of ``WeixinRequest`` objects: one per article link,
            plus one for the next page when a next-page link is present
        """
        doc = pq(response.text)
        for link in doc('.news-box .news-list li .txt-box h3 a').items():
            article_url = link.attr('href')
            print(article_url)
            yield WeixinRequest(url=article_url, callback=self.parse_detail)
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            next_url = self.base_url + str(next_href)
            print(next_url)
            yield WeixinRequest(url=next_url,
                                callback=self.parse_index,
                                need_proxy=True)

    def parse_detail(self, response):
        """
        Parse an article (detail) page.

        :param response: response of a detail-page request
        :return: generator yielding one ``dict`` with the keys ``title``,
            ``content``, ``date``, ``nickname`` and ``wechat``
        """
        doc = pq(response.text)
        # Debug output: dumps the parsed document.
        print('article detail', doc)

        yield {
            'title': doc('h2.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#post-date').text(),
            'nickname': doc('#js_name').text(),
            'wechat': doc('#js_name > div > p:nth-child(3) > span').text()
        }

    def request(self, weixin_request):
        """
        Send a request through the shared session, optionally via a proxy.

        :param weixin_request: request to send; ``need_proxy`` decides whether
            a proxy is fetched, ``timeout`` is forwarded to the session
        :return: the response, or ``False`` on a connection error or read
            timeout
        """
        try:
            proxy = self.get_proxy() if weixin_request.need_proxy else None
            if not proxy:
                # No proxy requested, or none available: send directly.
                return self.session.send(weixin_request.prepare(),
                                         timeout=weixin_request.timeout,
                                         allow_redirects=False)
            proxies = {
                'http': 'http://' + proxy,
                'https': 'https://' + proxy
            }
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=False,
                                     proxies=proxies)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """
        Count a failed attempt and log it; the request is not re-queued.

        :param weixin_request: the request that failed
        :return: None
        """
        failures = weixin_request.fail_time + 1
        weixin_request.fail_time = failures
        print('Request Failed', failures, 'Times', weixin_request.url)
        if failures < MAX_FAILED_TIME:
            # Retrying is disabled here; the failure is only reported.
            print('skip failed request')

    def schedule(self):
        """
        Process queued requests until the queue is empty.

        A request is passed to ``error`` when its response is falsy, has a
        status outside ``VALID_STATUSES``, or its callback yields nothing.

        :return: None
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            print('Callback', callback)
            response = self.request(weixin_request)
            print('list', response)

            if not (response and response.status_code in VALID_STATUSES):
                self.error(weixin_request)
                continue

            # Materialise the generator so an empty result is detectable.
            results = list(callback(response))
            if not results:
                self.error(weixin_request)
                continue

            for result in results:
                print('New Result', type(result))
                print('save data', result)
                if isinstance(result, WeixinRequest):
                    self.queue.add(result)
                if isinstance(result, dict):
                    self.mysql.insert('articles', result)

    def run(self):
        """
        Entry point: seed the queue, then process it.

        :return: None
        """
        self.start()
        self.schedule()
Exemplo n.º 14
0
class spider(object):
    """
    Queue-driven crawler for Sogou's WeChat article search.

    ``start`` enqueues the first search request; ``scheduler`` pops requests
    from ``queue``, sends them through the shared ``session`` and hands each
    valid response to the request's callback. Callbacks yield
    ``weixinrequest`` objects (queued again) or ``dict`` records (inserted
    into the ``articles`` table).
    """
    base_url='http://weixin.sogou.com/weixin'
    # Header names and values previously contained stray spaces
    # ('User - Agent', 'keep - alive', ...), so the intended standard headers
    # were never sent; they are spelled correctly here.
    # NOTE(review): 'Host' is fixed to pb.sogou.com for every request although
    # ``base_url`` is on weixin.sogou.com -- confirm this is intended.
    headers={
    'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate,UTF-8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
    'Connection': 'keep-alive',
    'Host': 'pb.sogou.com',
    'Referer': 'http://wx.sogou.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 YaBrowser/18.6.1.770 Yowser/2.5 Safari/537.36'
    }
    # Captures the value assigned to ``var publish_time`` in the page source.
    ruler=re.compile(r'var publish_time = "(\d+.\d+.\d+)')
    keywords='保加利亚妖王'
    session=requests.Session()
    queue=RedisQueue()
    mysql1=mysql()

    def get_proxy(self):
        """
        Fetch a proxy from the proxy pool's web interface.

        :return: the response body on HTTP 200; ``None`` on any other status
            or when the pool cannot be reached
        """
        try:
          response = requests.get(proxy_pool_web)
          if response.status_code == 200:
              print(response.text)
              return response.text
          return None
        except requests.ConnectionError:
            return None

    def start(self):
        """
        Apply the class headers to the session and enqueue the first search
        request (sent without a proxy).

        :return: None
        """
        self.session.headers.update(self.headers)
        start_url = self.base_url +'?'+ urlencode({'query':self.keywords,'type':2})
        # Build the first request as an instance of the project's request class.
        weixin_request= weixinrequest(callback=self.parse_index,url=start_url,need_proxy=False)
        self.queue.add(weixin_request)

    def parse_index(self,response):
        """
        Parse a search-result (index) page.

        :param response: response of an index-page request
        :return: generator of ``weixinrequest`` objects: one per article link
            (via proxy), plus one for the next page when a next-page link is
            present
        """
        response.encoding='utf-8'
        doc=etree.HTML(response.text)
        url_details=doc.xpath('//div[@class="txt-box"]/h3/a/@href')
        print('开始解析')
        print(url_details)
        for i in url_details:
            wei=weixinrequest(url=i,callback=self.parse_detail,need_proxy=True)
            print(wei.url)
            yield wei

        next_page=doc.xpath('//a[@id="sogou_next"]/@href')
        if next_page:
            # Pause before queuing the next index page.
            time.sleep(2)
            url_nex=self.base_url + str(next_page[0])
            wei_nex=weixinrequest(url=url_nex,callback=self.parse_index,need_proxy=False)
            yield wei_nex


    def parse_detail(self,reponse):
        """
        Parse an article (detail) page.

        :param reponse: response of a detail-page request
        :return: generator yielding one ``dict`` with the keys ``titles``,
            ``contens``, ``date`` and ``wechat``; ``date`` is ``''`` when the
            page has no ``publish_time`` variable
        """
        time.sleep(1)
        reponse.encoding='utf-8'
        doc=pq(reponse.text)
        print("细节解析")
        # A page without a publish_time script variable used to raise
        # IndexError here and abort the whole scheduler loop.
        date_match=self.ruler.search(reponse.text)
        data={
            'titles': doc('#activity-name').text(),
            'contens':doc('#js_content').text(),
            'date': date_match.group(1) if date_match else '',
            'wechat': doc('#js_name').text()
        }
        print(data)
        yield data

    def requesttt(self,wx):
        """
        Send a request through the shared session, optionally via a proxy.

        :param wx: request to send; it is converted with ``prepare()`` and its
            ``need_proxy`` / ``timeout`` attributes control the call
        :return: the response, or ``None`` on a connection error or timeout
        """
        try:
            if wx.need_proxy:
                proxy=self.get_proxy()
                # get_proxy() returns None when the pool is unavailable; fall
                # back to a direct request instead of failing on
                # ``'http://' + None``.
                if proxy:
                    proxies={
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    return self.session.send(wx.prepare(),timeout=wx.timeout,allow_redirects=True,proxies=proxies)
            return self.session.send(wx.prepare(),timeout=wx.timeout,allow_redirects=True)
        # requests.Timeout covers both connect and read timeouts; read
        # timeouts were previously uncaught.
        except (requests.ConnectionError,requests.Timeout) as e:
            print(e.args)
            return None

    def error(self,wx):
        """
        Count a failed attempt and re-queue the request while its failure
        count does not exceed ``Max_failed_time``.

        :param wx: the request that failed
        :return: None
        """
        wx.fail_time=wx.fail_time +1
        print('request failed %s times %s '%(wx.fail_time,wx.url))
        if wx.fail_time <= Max_failed_time:
            self.queue.add(wx)

    def scheduler(self):
        """
        Process queued requests until the queue is empty.

        A request is passed to ``error`` when its response is falsy, has a
        status outside ``Valid_statues``, or its callback yields nothing.

        :return: None
        """
        while not self.queue.empty():
            wx=self.queue.pop()
            callback=wx.callback
            print('schedule %s'% wx.url)
            rseponse=self.requesttt(wx)
            if rseponse and rseponse.status_code in Valid_statues:
                print('zhix')
                # Materialise the generator so an empty result is detectable.
                results = list(callback(rseponse))
                if results:
                    for result in results:
                        if isinstance(result,weixinrequest):
                            self.queue.add(result)
                        if isinstance(result,dict):
                            self.mysql1.insert('articles',result)
                else:
                    self.error(wx)
            else:
              self.error(wx)

    def run(self):
        """
        Entry point: seed the queue, then process it.

        :return: None
        """
        self.start()
        self.scheduler()
Exemplo n.º 15
0
class Spider():
    """
    Queue-driven crawler for Sogou WeChat article search results.

    Requests are stored in a ``RedisQueue``; ``schedule`` pops them one at a
    time, sends them through a shared ``Session`` and hands each response to
    the request's callback. Callbacks yield either new ``WeixinRequest``
    objects (queued) or dicts (inserted into the ``articles`` table).
    """
    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # Browser-like headers (including a logged-in Cookie) applied to every
    # request made through the shared session.
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.9',
        'Cache-Control':
        'max-age=0',
        'Connection':
        'keep-alive',
        'Cookie':
        'IPLOC=CN3100; SUV=006058CA7258092E5A365AE5C31C4119; usid=7owwSvanBFp34s-d; ABTEST=8|1535070343|v1; SUID=4AFC5A653F18960A000000005B7F5087; SUID=4AFC5A652E18960A000000005B7F5087; weixinIndexVisited=1; SNUID=76C7665E3B39481850181AC33C52A601; sct=3; JSESSIONID=aaaDu_x19Td39bcXoOBvw',
        'Host':
        'weixin.sogou.com',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    # Class-level collaborators: created once when the class is defined and
    # shared by every Spider instance.
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """
        Fetch a proxy address from the proxy pool.

        :return: the body of the pool's response on HTTP 200 (presumably a
                 ``host:port`` string, since callers prepend a scheme to it);
                 None on any other status or on a connection error
        """
        try:
            response = requests.get(PROXY_POOL_URL)
        except requests.ConnectionError:
            return None
        if response.status_code == 200:
            print('Get Proxy', response.text)
            return response.text
        return None

    def start(self):
        """
        Initialise the crawl: install the headers and queue the first request.
        """
        # Update the session headers globally so every request carries them.
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + \
            urlencode({'query': self.keyword, 'type': 2})
        weixin_request = WeixinRequest(url=start_url,
                                       callback=self.parse_index,
                                       need_proxy=False)
        # Schedule the first request.
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """
        Parse a search-result index page.

        :param response: response of an index-page request
        :return: generator yielding one ``WeixinRequest`` per article link
                 (handled by ``parse_detail``) and, when a next-page link is
                 present, one ``WeixinRequest`` for the next index page
        """
        doc = pq(response.text)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
            yield weixin_request
        # Named next_href rather than `next` to avoid shadowing the builtin.
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            # The link is appended directly to base_url to form the page URL.
            url = self.base_url + str(next_href)
            weixin_request = WeixinRequest(url=url,
                                           callback=self.parse_index,
                                           need_proxy=False)
            yield weixin_request

    def parse_detail(self, response):
        """
        Parse an article detail page.

        :param response: response of a detail-page request
        :return: generator yielding a single dict with the keys ``title``,
                 ``content``, ``date``, ``nickname`` and ``wechat``
        """
        doc = pq(response.text)
        data = {
            'title':
            doc('.rich_media_title').text(),
            'content':
            doc('.rich_media_content').text(),
            'date':
            doc('#post-date').text(),
            'nickname':
            doc('#js_profile_qrcode > div > strong').text(),
            'wechat':
            doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }
        yield data

    def error(self, weixin_request):
        """
        Handle a failed request: count the failure and retry if allowed.

        :param weixin_request: the request that failed; its ``fail_time``
                               counter is incremented in place
        :return: None
        """
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Failed', weixin_request.fail_time, 'Times',
              weixin_request.url)
        # Re-queue only while the failure count is below the limit.
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """
        Drain the request queue, dispatching each response to its callback.

        A falsy response, a status code outside ``VALID_STATUSES``, or a
        callback that yields nothing is passed to ``error``.

        :return: None
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            print('Response', response)
            if response and response.status_code in VALID_STATUSES:
                # Callbacks are generators; materialize the output so an empty
                # result can be detected and treated as a failure.
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', type(result))
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def request(self, weixin_request):
        """
        Send a request through the shared session.

        :param weixin_request: the request to send; ``need_proxy`` selects
                               whether a proxy from the pool is used
        :return: the response, or False when a ``ConnectionError`` or
                 ``ReadTimeout`` is raised
        """
        send_kwargs = {
            'timeout': weixin_request.timeout,
            'allow_redirects': False,
        }
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                # When no proxy could be obtained, fall back to a direct
                # request.
                if proxy:
                    # Keys are bare scheme names; a key such as 'https:'
                    # would never match and HTTPS traffic would silently
                    # bypass the proxy.
                    send_kwargs['proxies'] = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
            return self.session.send(weixin_request.prepare(), **send_kwargs)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def run(self):
        """
        Entry point: initialise the crawl, then process the queue until empty.

        :return: None
        """
        self.start()
        self.schedule()