Example #1
import sys


def get_valid_proxy(target_url, ip_set, referer='https://www.google.com'):
    """Extract a valid proxy for target_url from Redis.

    Args:
        target_url (str): URL that needs to be visited through a proxy
        ip_set (str): name of the Redis set that stores the proxies
        referer (str, optional): referer used to build headers when testing whether a proxy is valid
    
    Returns:
        curr_proxy(str): a valid proxy in the format of ip:port
    """
    try:
        conn = get_connection()
        proxies = conn.srandmember(ip_set, 5)
        curr_proxy = proxies.pop()
        # if proxy is not valid, delete it from redis
        while not is_valid(target_url, curr_proxy, referer):
            conn.srem(ip_set, curr_proxy)
            if len(proxies) == 0:
                proxies = conn.srandmember(ip_set, 5)
            curr_proxy = proxies.pop()
        return curr_proxy
    except Exception as e:
        print('Error while getting proxy from redis\n%s' % e)
        sys.exit(0)
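
This example depends on two helpers that are not shown here: get_connection() and is_valid(). A minimal sketch of what they could look like, assuming a local Redis instance and a plain requests-based check (the bodies below are assumptions, not the project's actual code):

import redis
import requests


def get_connection(DB=0):
    # assumed helper: connect to a local Redis instance;
    # decode_responses=True so set members come back as str rather than bytes
    return redis.StrictRedis(host='127.0.0.1', port=6379, db=DB, decode_responses=True)


def is_valid(target_url, proxy, referer='https://www.google.com'):
    # assumed helper: a proxy counts as valid if target_url answers with
    # HTTP 200 through it within a short timeout
    headers = {'user-agent': 'Mozilla/5.0', 'referer': referer}
    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
    try:
        resp = requests.get(target_url, headers=headers, proxies=proxies, timeout=10)
        return resp.status_code == 200
    except requests.RequestException:
        return False
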
Example #2
import time

import requests
from bs4 import BeautifulSoup


def get_proxies(proxy_type, ip_set, start_page, end_page):
    """Extract proxies from the page source and store them in Redis.

    Args:
        proxy_type (str): base URL for the proxy type, e.g. the global variables CHINA and OTHER
        ip_set (str): name of the Redis set the IPs should be stored in
        start_page (int): page to start crawling from
        end_page (int): page to stop crawling at (inclusive)
    """
    try:
        conn = get_connection()
    except Exception:
        print('Error while connecting to redis')
        return
    for page in range(start_page, end_page + 1):
        if page % 2 == 0:
            time.sleep(20)
        # get page source code
        headers = {
            'user-agent': generate_user_agent(),
            'referer': 'http://www.xicidaili.com/'
        }
        text = requests.get(proxy_type + str(page), headers=headers).text
        # extract ips from source code
        soup = BeautifulSoup(text, 'lxml')
        for tr in soup.find_all('tr')[1:]:
            tds = tr.find_all('td')
            # optional filter: keep only proxies whose location column is 美国 (United States)
            # if '美国' in tds[3].text:
            proxy = tds[1].text + ':' + tds[2].text
            if is_valid('https://www.amazon.com/', proxy):
                conn.sadd(ip_set, proxy)
                print('%s added to ip set %s' % (proxy, ip_set))
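
The docstring refers to module-level constants CHINA and OTHER for the xicidaili list URLs, but their values are not shown in these examples. A hedged usage sketch with placeholder values for those constants:

# placeholder values; the real project may point these at different list pages
CHINA = 'http://www.xicidaili.com/nn/'
OTHER = 'http://www.xicidaili.com/wn/'

if __name__ == '__main__':
    # crawl pages 1-5 of the China list and store working proxies in the
    # same Redis set ('china_ips') that the later examples read from
    get_proxies(CHINA, 'china_ips', 1, 5)
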
Example #3
def get_user_names():
    """Read names from the local file 'names' and add the title-cased names to the Redis set 'user_name'."""
    r = get_connection(DB=1)
    with open('names') as f:
        for line in f:
            r.sadd('user_name', line.strip().title())
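
A small usage sketch, assuming the local file 'names' holds one raw name per line (e.g. 'john smith'); each line is title-cased before being added to the set:

if __name__ == '__main__':
    get_user_names()
    r = get_connection(DB=1)
    # pull one random name back out to verify the set was populated
    print(r.srandmember('user_name'))
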
Example #4
    name_visa = info[0]['value'] + '#' + info[11]['value'] + '#' + info[13]['value']
    print(name_phone, name_visa)
    return name_phone, name_visa


def get_user_names():
    r = get_connection(DB=1)
    with open('names') as f:
        for line in f:
            # print line.strip().title()
            r.sadd('user_name', line.strip().title())


if __name__ == '__main__':
    r = get_connection(DB=3)
    crawl_address, crawl_phone_visa = True, False
    if crawl_address:
        count = 0
        while True:
            if count % 10 == 0:
                proxy = get_valid_proxy(
                    'https://fakena.me/random-real-address/',
                    'china_ips',
                    referer=r'https://fakena.me')
                print('current proxy: %s' % proxy)
            addr = get_address(proxy)
            if addr:
                r.sadd('address', addr)
                print('successfully added address %s to redis' % addr)
            count += 1
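
get_address() is not shown in these examples. A minimal sketch of what it might look like, assuming it fetches the fakena.me generator page through the given proxy; the HTML parsing below is a placeholder, not the real page structure:

import requests
from bs4 import BeautifulSoup


def get_address(proxy):
    # assumed helper: fetch a random address through the given ip:port proxy
    url = 'https://fakena.me/random-real-address/'
    headers = {'referer': 'https://fakena.me'}
    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
    try:
        text = requests.get(url, headers=headers, proxies=proxies, timeout=10).text
    except requests.RequestException:
        return None
    soup = BeautifulSoup(text, 'lxml')
    # placeholder selector (assumption); adjust to the page's actual markup
    node = soup.find('strong')
    return node.text.strip() if node else None
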
Example #5
    name_phone = info[0]['value'] + '#' + info[9]['value']
    name_visa = info[0]['value'] + '#' + info[11]['value'] + '#' + info[13]['value']
    print(name_phone, name_visa)
    return name_phone, name_visa


def get_user_names():
    r = get_connection(DB=1)
    with open('names') as f:
        for line in f:
            # print line.strip().title()
            r.sadd('user_name', line.strip().title())


if __name__ == '__main__':
    r = get_connection(DB=3)
    crawl_address, crawl_phone_visa = True, False
    if crawl_address:
        count = 0
        while True:
            if count % 10 == 0:
                proxy = get_valid_proxy('https://fakena.me/random-real-address/', 'china_ips', referer=r'https://fakena.me')
                print('current proxy: %s' % proxy)
            addr = get_address(proxy)
            if addr:
                r.sadd('address', addr)
                print('successfully added address %s to redis' % addr)
            count += 1
            time.sleep(5)
    elif crawl_phone_visa:
        while True: