Example #1
    def process_request(self, request, spider):
        # Refresh the proxy pool every 200 requests, then route this request
        # through a randomly chosen proxy.
        if self.count % 200 == 0:
            print('fetching proxies ..........')
            self.PROXIES = fetch_all()
        proxy = random.choice(self.PROXIES)
        request.meta['proxy'] = "http://%s" % proxy
        self.count += 1
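
All of the examples in this listing revolve around a fetch_all() helper that scrapes a pool of free proxies. As a point of reference, here is a minimal, self-contained sketch of how the middleware in Example #1 could be wired up; the stub fetch_all(), the class name, and the settings path are assumptions for illustration, not the original project's code.

    import random

    def fetch_all():
        # Stand-in for the real scraper: a static list of host:port strings.
        return ["10.0.0.1:8080", "10.0.0.2:3128"]

    class RandomProxyMiddleware(object):
        def __init__(self):
            self.count = 0
            self.PROXIES = fetch_all()

        def process_request(self, request, spider):
            if self.count % 200 == 0:  # refresh the pool every 200 requests
                self.PROXIES = fetch_all()
            request.meta['proxy'] = "http://%s" % random.choice(self.PROXIES)
            self.count += 1

    # settings.py (hypothetical module path):
    # DOWNLOADER_MIDDLEWARES = {"myproject.middlewares.RandomProxyMiddleware": 543}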
Example #2
    def fetch_new_proxyes(self):
        logger.info("extending proxyes using fetch_free_proxyes.py")
        new_proxyes = fetch_free_proxyes.fetch_all()
        logger.info("new proxyes: %s" % new_proxyes)

        for np in new_proxyes:
            if np not in self.proxyes:
                # Track each new proxy with fresh usage/validity bookkeeping.
                self.proxyes[np] = {'count': 0, 'valid': True, 'last': 0}
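
Example #2 keeps per-proxy bookkeeping in a dict of the form {'count': 0, 'valid': True, 'last': 0}. A sketch of how such a structure might be consumed on the other side, picking only proxies still marked valid and recording usage, could look like this (the function names are ours, not the source's):

    import random
    import time

    def pick_proxy(proxyes):
        # Choose among proxies still marked valid; None if the pool is exhausted.
        valid = [p for p, meta in proxyes.items() if meta['valid']]
        if not valid:
            return None
        proxy = random.choice(valid)
        proxyes[proxy]['count'] += 1
        proxyes[proxy]['last'] = time.time()
        return proxy

    def mark_failed(proxyes, proxy):
        # Flag a proxy so pick_proxy() skips it from now on.
        proxyes[proxy]['valid'] = False

    proxyes = {"1.2.3.4:80": {'count': 0, 'valid': True, 'last': 0}}
    print(pick_proxy(proxyes))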
Example #3
    def process_request(self, request, spider):
        curr_time = time.time()
        local_proxy = '127.0.0.1:80'

        if not MyGlobals.last_fetch_time or curr_time > MyGlobals.last_fetch_time + FETCH_PROXY_INTERVAL:
            log.msg('start fetch proxy ips...', level=log.INFO)
            MyGlobals.proxy_list = fetch_all()
            MyGlobals.proxy_list.append(local_proxy)
            MyGlobals.last_fetch_time = curr_time

        proxy = random.choice(MyGlobals.proxy_list)
        if local_proxy == proxy:
            # Local placeholder chosen: leave request.meta untouched (direct fetch).
            log.msg('*******current proxy is:local', level=log.INFO)
        else:
            log.msg('*******current proxy is:{0}'.format(proxy), level=log.INFO)
            request.meta['proxy'] = "http://%s" % proxy
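
Example #3 leans on a MyGlobals holder and a FETCH_PROXY_INTERVAL constant that the snippet does not show. A plausible minimal definition, with an assumed ten-minute refresh interval, would be:

    FETCH_PROXY_INTERVAL = 10 * 60  # seconds; the real value is not shown in the snippet

    class MyGlobals(object):
        # Module-level state shared across all requests, as the snippet assumes.
        proxy_list = []
        last_fetch_time = 0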
Example #4
    def fetch_new_proxyes(self):
        """
        Fetch new proxies from the web and add them to the proxy list.
        """
        logger.info("extending proxyes using fetch_free_proxyes.py")
        new_proxyes = fetch_free_proxyes.fetch_all(https=self.use_https)
        logger.info("new proxyes: %s" % new_proxyes)
        self.last_fetch_proxy_time = datetime.now()

        for np in new_proxyes:
            if not self.url_in_proxyes(np):
                self.proxyes.append({"proxy": np, "valid": True, "count": 0})
        # If hardly any new proxies turn up, lower the threshold so we stop
        # refetching in vain.
        if self.len_valid_proxy() < self.extend_proxy_threshold:
            self.extend_proxy_threshold -= 1
Example #5
    def fetch_new_proxyes(self):
        """
        Fetch new proxies from the web and add them to the proxy list.
        """
        logger.info("extending proxyes using fetch_free_proxyes.py")
        new_proxyes = fetch_free_proxyes.fetch_all()
        logger.info("new proxyes: %s" % new_proxyes)
        self.last_fetch_proxy_time = datetime.now()

        for np in new_proxyes:
            if not self.url_in_proxyes("http://" + np):
                self.proxyes.append({"proxy": "http://" + np,
                                     "valid": True,
                                     "count": 0})
        # If hardly any new proxies turn up, lower the threshold so we stop
        # refetching in vain.
        if self.len_valid_proxy() < self.extend_proxy_threshold:
            self.extend_proxy_threshold -= 1
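
Examples #4 and #5 call url_in_proxyes() and len_valid_proxy() without defining them. Against the list-of-dicts layout they use, plausible reconstructions (written here as standalone functions rather than methods) would be:

    def url_in_proxyes(proxyes, url):
        # True if this proxy URL is already in the pool.
        return any(p["proxy"] == url for p in proxyes)

    def len_valid_proxy(proxyes):
        # Number of proxies still marked usable.
        return sum(1 for p in proxyes if p["valid"])

    proxyes = [{"proxy": "http://1.2.3.4:80", "valid": True, "count": 0}]
    print(url_in_proxyes(proxyes, "http://1.2.3.4:80"), len_valid_proxy(proxyes))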
Example #6
    def fetch_new_proxyes(self):
        """
        Fetch new proxies from the web and add them to the proxy list.
        """
        self.logger.info("extending proxyes using fetch_free_proxyes.py")
        new_proxyes = fetch_free_proxyes.fetch_all(log=self.logger)
        self.logger.info("new proxyes: %s" % new_proxyes)
        self.last_fetch_proxy_time = datetime.now()
        next_index = None

        for np in new_proxyes:
            if not self.url_in_proxyes("http://" + np):
                new_one = {"proxy": "http://" + np, "valid": True, "count": 0}
                self.proxyes.append(new_one)
                # Remember where the first fresh proxy landed; compare with
                # None so a legitimate index of 0 is not treated as falsy.
                if next_index is None:
                    next_index = self.proxyes.index(new_one)
        if next_index is not None:
            self.proxy_index = next_index
        # If hardly any new proxies turn up, lower the threshold so we stop
        # refetching in vain.
        if self.len_valid_proxy() < self.extend_proxy_threshold:
            self.extend_proxy_threshold -= 1
Example #7
    def updateProxyAddr(self):
        # Pair each freshly fetched proxy with a generated value and push the
        # whole mapping into the backing store in one call.
        self.pushall(dict(zip(fetch_all(log=self.logger), self._yield())))
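
Example #7 gives no definitions for pushall() or _yield(). The pattern, pairing every fetched proxy with a generated payload and storing the mapping in one bulk call, might look like the following sketch; the in-memory dict store and the zero initial score are assumptions, not the original implementation:

    import itertools

    class ProxyStore(object):
        def __init__(self):
            self.store = {}

        def _yield(self):
            # Assumed: an endless supply of initial scores to pair with proxies.
            return itertools.repeat(0)

        def pushall(self, mapping):
            # Assumed: bulk-insert into a plain dict instead of, say, Redis.
            self.store.update(mapping)

        def updateProxyAddr(self, proxies):
            self.pushall(dict(zip(proxies, self._yield())))

    s = ProxyStore()
    s.updateProxyAddr(["1.2.3.4:80", "5.6.7.8:3128"])
    print(s.store)  # {'1.2.3.4:80': 0, '5.6.7.8:3128': 0}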
Example #8
def click(url, content, socketio=None, proxy=False):
    proxies = [None]
    if proxy:
        if socketio:
            socketio.sleep(1)
            socketio.emit('my_response',
                          {'data': '免费代理获取中  \n这可能花费几分钟,请稍后...'},  # "fetching free proxies; this may take a few minutes"
                          namespace='/patent')
        proxies = fproxy.fetch_all()
        proxies = [{'http': 'http://' + x} for x in proxies]
        if socketio:
            socketio.sleep(1)
            socketio.emit('my_response',
                          {'data': '免费代理获取完毕,总共%d条。' % len(proxies)},  # "done fetching; %d proxies in total"
                          namespace='/patent')
    form = form_produce(content)
    num = get_page_nums(url, form)
    logger.info(num)
    mongo = mongoConnection.mongoConnection(db='patent',
                                            collection='patentinfo')
    i = 1
    if not num and socketio:
        socketio.emit('my_response', {'data': '目标网站连接失败,请稍后重试!'},  # "failed to reach the target site, please retry later"
                      namespace='/patent')
        socketio.emit('disconnect', {'data': 'disconnect'},
                      namespace='/patent')
        return
    while i <= num:
        failed_tag = 0
        attempt = 0
        form = form_produce(content, i)
        proxie = random.choice(proxies)
        patents = get_patent(url, form, proxie)
        while patents is None:
            logger.debug('失败次数为:' + str(attempt + 1) + str(failed_tag))  # "failure count: ..."
            failed_tag += 1
            attempt += 1
            if attempt % 3 == 0:
                # Three consecutive failures for this page: give up on it.
                attempt = 0
                break
            if failed_tag % 10 == 0:
                logger.info("抓取新代理,请稍等")  # "fetching new proxies, please wait"
                if socketio:
                    socketio.sleep(1)
                    socketio.emit('my_response', {'data': '抓取新代理,请稍等'},
                                  namespace='/patent')
                proxies = fproxy.fetch_all()
                proxies = [{'http': 'http://' + x} for x in proxies]
            proxie = random.choice(proxies)
            patents = get_patent(url, form, proxie)

        failed_tag = 0
        if patents != -1:
            try:
                for x in patents['titles']:
                    logger.info('title:' + x)
                    if socketio:
                        socketio.sleep(1)
                        socketio.emit('my_response', {'data': 'title:' + x},
                                      namespace='/patent')
                store(patents, str(content['_id']))
            except Exception as e:
                logger.debug(e)
                logger.debug('插入数据库失败...')  # "database insert failed"
        i += 50
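
The inner loop of Example #8 (rotate the proxy on every failure, refetch the whole pool every ten cumulative failures, give up on a page after three attempts) can be distilled into a reusable helper. The sketch below uses our own names and a callable interface; it mirrors the control flow above rather than the source's exact code:

    import random

    def fetch_with_rotation(fetch, refresh_pool, proxies,
                            max_attempts=3, refresh_every=10, failed_tag=0):
        # Try up to max_attempts proxies; refresh the pool every
        # refresh_every cumulative failures across calls.
        for _ in range(max_attempts):
            proxy = random.choice(proxies)
            result = fetch(proxy)
            if result is not None:
                return result, failed_tag
            failed_tag += 1
            if failed_tag % refresh_every == 0:
                proxies[:] = refresh_pool()
        return None, failed_tag

    flaky = iter([None, None, {"titles": ["ok"]}])
    result, _ = fetch_with_rotation(lambda p: next(flaky),
                                    lambda: ["9.9.9.9:80"],
                                    ["1.2.3.4:80"])
    print(result)  # succeeds on the third attempt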
Example #9
    def updateProxyAddr(self):
        self.pushall(dict(zip(fetch_all(log=self.logger), self._yield())))
Example #10
def click(url, socketio=None, proxy=False):
    logger.info('免费代理获取中  \n这可能花费几分钟,请稍后...')  # "fetching free proxies; this may take a few minutes"
    if socketio:
        socketio.emit('my_response', {'data': '免费代理获取中  \n这可能花费几分钟,请稍后...'},
                      namespace='/paper')
        socketio.sleep(1)

    proxies = [None]
    if proxy:
        proxies = fproxy.fetch_all()
        proxies = [{'http': 'http://' + x} for x in proxies]
    logger.info('免费代理获取完毕,总共%d条。' % len(proxies))  # "done fetching; %d proxies in total"
    if socketio:
        socketio.sleep(1)
        socketio.emit('my_response',
                      {'data': '免费代理获取完毕,总共%d条。' % len(proxies)},
                      namespace='/paper')
    num = get_page_nums(url)
    logger.info(num)
    if not num:
        # Guard against a failed page-count fetch (the same check Example #8
        # makes) instead of letting the loop below crash on None.
        logger.debug('目标网站连接失败,请稍后重试!')  # "failed to reach the target site, please retry later"
        return
    mongo = mongoConnection.mongoConnection(db='wanFang',
                                            collection='paperinfo')
    # i = page_index(PAGEINDEX, 'r')   # optionally resume from a saved page index
    i = 0
    while i <= (num + 9) // 10:
        failed_tag = 0
        new_url = url + '&p=' + str(i)
        proxie = random.choice(proxies)
        item = get_url(new_url, proxie)
        if item is not None and item['paper_urls'] != []:
            for paper_url, quote in zip(item['paper_urls'], item['quotes']):
                attempt = 0
                papers = get_paper(paper_url, quote, proxie)
                while papers is None:
                    failed_tag += 1
                    attempt += 1
                    if attempt % 3 == 0:
                        # Three consecutive failures for this paper: skip it.
                        attempt = 0
                        break
                    if failed_tag % 10 == 0:
                        logger.info("抓取新代理,请稍等")  # "fetching new proxies, please wait"
                        proxies = fproxy.fetch_all()
                        proxies = [{'http': 'http://' + x} for x in proxies]
                    proxie = random.choice(proxies)
                    papers = get_paper(paper_url, quote, proxie)

                if papers is not None and papers != -1:
                    failed_tag = 0
                    logger.info('papers:' + papers['title'])
                    if socketio:
                        socketio.emit('my_response',
                                      {'data': 'papers:' + papers['title']},
                                      namespace='/paper')
                        socketio.sleep(1)
                    try:
                        papers['url'] = url
                        papers = clean.clean(papers)  # normalize into the required schema
                        mongo.collection.insert(papers)
                    except Exception as e:
                        logger.debug(e)
                        logger.debug(papers)
                        logger.debug('插入数据库失败...')  # "database insert failed"

            i += 1  # this page succeeded; move to the next (otherwise re-pick a proxy and retry)
            page_index(PAGEINDEX, 'w', str(i + 1))
        else:
            failed_tag += 1
            if failed_tag % 3 == 0:
                proxies = fproxy.fetch_all()
                proxies = [{'http': 'http://' + x} for x in proxies]
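
Examples #8 and #10 both build requests-style proxy dicts of the form {'http': 'http://' + x}. The fetch helpers themselves (get_patent, get_paper, get_url) are not part of this listing; if they are built on the requests library, as the proxy-dict format suggests, their core would look roughly like this sketch (fetch_page is our name, not the source's):

    import requests

    def fetch_page(url, form, proxie):
        # proxie is either None (direct connection) or {'http': 'http://host:port'};
        # callers above treat a None return as "retry with another proxy".
        try:
            resp = requests.post(url, data=form, proxies=proxie, timeout=10)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException:
            return None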