Example #1
def crawl():
    urls = [
        'http://www.xicidaili.com/nn/', 'http://www.xicidaili.com/nn/2',
        'http://www.xicidaili.com/wn/'
    ]
    result = []
    for url in urls:
        try:
            req = logic_common.build_request(url)
            soup = BeautifulSoup(req.text, 'lxml')
            table = soup.find('table', id='ip_list').find_all('tr')
        except Exception as e:
            error_log.error('Spider xicidaili error.[msg]={}'.format(e))
            continue
        for tr in table[1:]:  # skip the header row
            try:
                tds = tr.find_all('td')
                # tds[1] holds the IP, tds[2] the port
                ip = tds[1].get_text() + ':' + tds[2].get_text()
                result.append(ip)
            except Exception:
                pass
    info_log.info('Spider xicidaili success. Crawled IP count: {}'.format(
        len(result)))
    return result
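Every crawler in these examples delegates fetching to logic_common.build_request, which is not shown here. A minimal sketch of what such a helper might look like, assuming it merely wraps requests.get with a browser-like User-Agent and a timeout (the header and timeout values are illustrative assumptions, not the project's actual code):

import requests

def build_request(url, timeout=10):
    """Hypothetical stand-in for logic_common.build_request."""
    # A browser-like User-Agent keeps free-proxy sites from rejecting us.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    return requests.get(url, headers=headers, timeout=timeout)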
Example #2
def crawl():
    urls = [
        'http://www.data5u.com/free/gngn/index.shtml',
        'http://www.data5u.com/free/gwgn/index.shtml',
        'http://www.data5u.com/free/gnpt/index.shtml',
        'http://www.data5u.com/free/gwpt/index.shtml'
    ]
    result = []
    for url in urls:
        try:
            req = logic_common.build_request(url)
            soup = BeautifulSoup(req.text, 'lxml')
            table = soup.find_all('ul', {'class': 'l2'})
        except Exception as e:
            error_log.error('Spider data5u error.[msg]={}'.format(e))
            continue
        for item in table[1:]:  # the first <ul> is a header row
            try:
                spans = item.find_all('span')
                ip = spans[0].get_text()
                port = spans[1].get_text()
            except Exception:
                continue
            line = ip + ':' + port
            # strip stray whitespace picked up from the page markup
            result.append(line.replace('\r', '').replace('\n', '')
                          .replace('\t', '').replace(' ', ''))
    info_log.info('Spider data5u success. Crawled IP count: {}'.format(
        len(result)))
    return result
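The chained replace calls strip whitespace that the free-proxy pages embed inside table cells. An equivalent, arguably clearer idiom (a sketch of an alternative, not what the original code uses) is to split on any whitespace and re-join:

line = ' 1.2.3.4 :\n8080\t'
cleaned = ''.join(line.split())  # '1.2.3.4:8080'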
Example #3
def verify_ip(ip_item):
    """Re-check a stored proxy: refresh its utime if it still responds,
    otherwise remove it from the pool.

    :param ip_item: dict with 'ip', 'ip_port' and 'utime' keys
    :return: None
    """
    status = is_available('%s:%s' % (ip_item['ip'], ip_item['ip_port']))
    if status:
        ip_item['utime'] = int(time.time())
        try:
            proxypool.update_ip(ip_item)
        except Exception as e:
            error_log.error('update_ip fail.[item]={} [msg]={}'.format(
                json.dumps(ip_item), e))
            return
        info_log.info('update_ip success.[ip_item]={}'.format(
            json.dumps(ip_item)))
    else:
        try:
            proxypool.delete_ip(ip_item)
        except Exception as e:
            error_log.error('delete_ip fail.[item]={} [msg]={}'.format(
                json.dumps(ip_item), e))
            return
        info_log.info('delete_ip success.[ip_item]={}'.format(
            json.dumps(ip_item)))
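is_available is referenced here and in Example #7 but never defined in these examples. A minimal sketch, assuming the check routes a test request through the "ip:port" proxy (the probe URL and timeout are arbitrary choices for illustration):

import requests

def is_available(proxy):
    """Hypothetical availability check for an "ip:port" string."""
    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
    try:
        resp = requests.get('http://httpbin.org/ip',
                            proxies=proxies, timeout=5)
        return resp.status_code == 200
    except requests.RequestException:
        return False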
Example #4
def crawl():
    result = []
    for page in range(1, 6):  # pages 1 through 5
        url = 'https://proxy.coderbusy.com/classical/anonymous-type/highanonymous.aspx?page=%s' % page
        try:
            req = logic_common.build_request(url)
            soup = BeautifulSoup(req.text, 'lxml')
            table = soup.find('div', {'class': 'table-responsive'}).find_all('tr')
        except Exception as e:
            error_log.error('Spider CoderBusy error.[msg]={}'.format(e))
            continue
        for item in table[1:]:  # skip the header row
            try:
                tds = item.find_all('td')
                ip = tds[0].get_text()
                port = tds[2].get_text()
            except Exception:
                continue
            line = ip + ':' + port
            # strip stray whitespace picked up from the page markup
            result.append(line.replace('\r', '').replace('\n', '')
                          .replace('\t', '').replace(' ', ''))
    info_log.info('Spider CoderBusy success. Crawled IP count: {}'.format(
        len(result)))
    return result
Example #5
def crawl():
    urls = ['http://www.89ip.cn/tiqv.php?sxb=&tqsl=300&ports=&ktip=&xl=on&submit=%CC%E1++%C8%A1']
    result = []
    for pageurl in urls:
        try:
            req = logic_common.build_request(pageurl)
            html = req.text
        except Exception as e:
            error_log.error('Spider 89ip error.[msg]={}'.format(e))
            continue
        # raw string avoids invalid-escape warnings; matches "a.b.c.d:port"
        ips = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', html)
        result += ips
        time.sleep(2)  # be polite between requests
    info_log.info('Spider 89ip success. Crawled IP count: {}'.format(len(result)))
    return result
Example #6
def crawl():
    urls = [
        'http://www.66ip.cn/nmtq.php?getnum=600&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=0&proxytype=2&api=66ip'
    ]
    result = []
    for pageurl in urls:
        try:
            req = logic_common.build_request(pageurl)
            html = req.text
        except Exception as e:
            error_log.error('Spider 66ip error.[msg]={}'.format(e))
            continue
        # raw string avoids invalid-escape warnings; matches "a.b.c.d:port"
        ips = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', html)
        result += ips
        time.sleep(2)  # be polite between requests
    info_log.info('Spider 66ip success. Crawled IP count: {}'.format(
        len(result)))
    return result
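Examples #5 and #6 skip HTML parsing entirely: both endpoints return the proxy list as plain text, so a single regular expression extracts every "ip:port" pair. If the pattern is reused across calls, compiling it once is a small, cheap improvement (a sketch):

import re

IP_PORT_RE = re.compile(r'\d+\.\d+\.\d+\.\d+:\d+')

html = 'gateway 1.2.3.4:8080 ... 5.6.7.8:3128'
print(IP_PORT_RE.findall(html))  # ['1.2.3.4:8080', '5.6.7.8:3128']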
Example #7
def insert_into_proxypool(ip):
    """Insert an "ip:port" string into the proxy pool if it is reachable."""

    status = is_available(ip)
    if not status:
        return
    now = int(time.time())
    ip_item = {
        'ip': ip.split(':')[0],
        'ip_port': ip.split(':')[-1],
        'ctime': now,
        'utime': now
    }
    try:
        proxypool.insert_into_proxypool(ip_item)
    except Exception as e:
        error_log.error(
            'insert_into_proxypool fail.[ip_item]={} [msg]={}'.format(
                json.dumps(ip_item), e))
        return
    info_log.info('insert_into_proxypool success.[ip_item]={}'.format(
        json.dumps(ip_item)))
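Taken together, the pieces form a simple pipeline: each crawl() returns a list of "ip:port" strings, insert_into_proxypool validates and stores each one, and verify_ip later re-checks stored records. A hypothetical driver loop (this wiring is an assumption, not shown in the original examples):

for ip in crawl():             # any crawl() function above
    insert_into_proxypool(ip)  # probes with is_available, then stores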
Example #8
def crawl():
    result = []
    for page in range(1, 10):
        url = 'https://www.kuaidaili.com/ops/proxylist/{}/'.format(page)
        try:
            req = logic_common.build_request(url)
            soup = BeautifulSoup(req.text, 'lxml')
            table = soup.find('div', {'id': 'freelist'}).find('table').find_all('tr')
        except Exception as e:
            error_log.error('Spider kuaidaili error.[msg]={}'.format(e))
            continue
        for tr in table[1:]:  # skip the header row
            try:
                ip = tr.find('td', {'data-title': 'IP'}).get_text()
                port = tr.find('td', {'data-title': 'PORT'}).get_text()
                result.append(ip + ':' + port)
            except Exception:
                pass
    info_log.info('Spider kuaidaili success. Crawled IP count: {}'.format(len(result)))
    return result