import http.cookiejar
import re
import urllib.request


def crawl(parent_url):
    print('do crawl')
    cookie = http.cookiejar.CookieJar()
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open(parent_url)
    html = response.read().decode('gbk', 'ignore')
    html = html.replace('\\/', '/')
    new_url_list = []
    sku_information = []
    links = re.findall(r'href="(//[a-zA-Z0-9./\-]+)"', html)
    for link in links:
        # Chained `and`s short-circuit left to right: keep only jd.com links
        # that are not on the club/item.m/help/yp subdomains.
        if (link.find('jd.com') > 0 and link.find('club.jd.com') < 0
                and link.find('item.m.jd.com') < 0
                and link.find('help.jd.com') < 0
                and link.find('yp.jd.com') < 0):
            link = 'http:' + link
            new_url_list.append(link)
    if parent_url.find('item.jd.com') >= 0:
        # The first run of digits in a product-page URL is the SKU id.
        all_found = re.findall(r'\d+', parent_url)
        sku_information.append(all_found[0])
        for item in retrieve_sku_info(html):
            sku_information.append(item)
        print('sku', sku_information)
    return new_url_list, sku_information
def download_url_list():
    cookie = http.cookiejar.CookieJar()
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    url = 'http://www.jd.com'
    response = opener.open(url)
    html = response.read().decode('utf-8', 'ignore')
    start_pattern = 'class="cate_menu_lk"'
    end_pattern = '</a>'
    start_position = html.find(start_pattern)
    link_str = ''
    while start_position > 0:
        # Collect everything between each menu-link marker and its closing tag.
        html = html[start_position + len(start_pattern):]
        end_position = html.find(end_pattern)
        if end_position < 0:
            break  # no closing tag left; avoid looping forever
        link_str += html[0:end_position]
        start_position = html.find(start_pattern)
    url_list = []
    for url in re.findall(r'//[a-z./\d\-]+', link_str):
        url_list.append('http:' + url)
    return url_list
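The two functions above pair naturally: download_url_list() seeds the frontier and crawl() expands it. A minimal driver loop might look like the sketch below; the run_crawler name, page cap, and visited set are illustrative additions, and retrieve_sku_info is assumed to be defined elsewhere in the project.

from collections import deque

def run_crawler(max_pages=100):
    # Hypothetical driver: breadth-first over the links crawl() returns.
    queue = deque(download_url_list())  # seed with the category-menu links
    visited = set()
    all_skus = []
    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        new_urls, sku_info = crawl(url)
        if sku_info:
            all_skus.append(sku_info)
        queue.extend(u for u in new_urls if u not in visited)
    return all_skus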
def doHttpWithCookie(self, url, data=None, save_cookie=False):
    # A plain CookieJar has no load()/save(); MozillaCookieJar does, and it
    # remembers the filename passed to its constructor.
    cookie = http.cookiejar.MozillaCookieJar(self.cookie_filename)
    cookie.load(ignore_discard=True, ignore_expires=True)
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open(url)
    for item in cookie:
        print('Name = ' + item.name)
        print('Value = ' + item.value)
    if save_cookie:
        cookie.save(ignore_discard=True, ignore_expires=True)
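One caveat with the rewrite above: MozillaCookieJar.load() raises FileNotFoundError when the cookie file does not exist yet, so a first run has to create it. A small bootstrap sketch, assuming a cookies.txt path chosen here for illustration:

import os
import http.cookiejar

cookie_filename = 'cookies.txt'  # illustrative path
cookie = http.cookiejar.MozillaCookieJar(cookie_filename)
if os.path.exists(cookie_filename):
    cookie.load(ignore_discard=True, ignore_expires=True)
else:
    cookie.save()  # write an empty jar so later load() calls succeed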
import http.cookiejar
import ssl
import urllib.request


def Brower(url):
    login_page = "https://10.64.70.225/cgi-bin/logon.cgi"
    # Form payload captured with Fiddler.
    login_data = ("usrname=admin&passwd=admin&isCookieEnable=1&action=on"
                  "&wrong_passwd=%3C%21--invalid_passwd_flag--%3E")
    try:
        cj = http.cookiejar.CookieJar()
        # The device most likely uses a self-signed certificate, so skip
        # verification for this HTTPS opener.
        ctx = ssl._create_unverified_context()
        opener = urllib.request.build_opener(
            urllib.request.HTTPSHandler(context=ctx),
            urllib.request.HTTPCookieProcessor(cj))
        opener.addheaders = [(
            'User-agent',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; '
            '.NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; '
            'MS-RTC LM 8; InfoPath.2; CIBA; .NET4.0C; .NET4.0E; .NET CLR 1.1.4322)'
        )]
        # POST bodies must be bytes in Python 3.
        opener.open(login_page, login_data.encode('ascii'))
        op = opener.open(url)
        data = op.read()
        return data
    except Exception as exc:
        print(str(exc))
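Calling the function is a single line; the status page below is only a guess at what such a device might expose, not a path confirmed by the source:

# Hypothetical page on the same device; adjust to the real target.
page = Brower('https://10.64.70.225/cgi-bin/status.cgi')
if page:
    print(page[:200])  # peek at the start of the response body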
import http.cookiejar
import urllib.request

print('start')
cookie = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
response = opener.open('http://www.zhihu.com')
for item in cookie:
    print(item.name + ':' + item.value)
print('over')
import http.cookiejar
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup

# Set a proxy IP
proxy_support = urllib.request.ProxyHandler({'http': '120.197.234.164:80'})
# Set up cookie handling
cookie_support = urllib.request.HTTPCookieProcessor(http.cookiejar.LWPCookieJar())
opener = urllib.request.build_opener(proxy_support, cookie_support,
                                     urllib.request.HTTPHandler)
urllib.request.install_opener(opener)

# Starting URL
#hosturl = "http://www.renren.com"
hosturl = "http://mail.163.com/"
# URL that receives the form data
#posturl = "http://www.renren.com/ajaxLogin/login"
posturl = ("https://mail.163.com/entry/cgi/ntesdoor?df=mail163_letter&from=web"
           "&funcid=loginone&iframe=1&language=-1&passtype=1&product=mail163"
           "&net=e&style=-1&race=118_35_39_bj&[email protected]")
# Form data to send (POST bodies must be bytes in Python 3)
postdata = urllib.parse.urlencode({
    "username": "******",
    "password": "******",
}).encode('utf-8')
# Request headers
headers = {
    #'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0/',
    #'Referer':'http://www.renren.com/'
    'User-Agent': "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0",
    'Referer': 'http://mail.163.com/',
}
# Build the HTTP request
req = urllib.request.Request(posturl, postdata, headers)
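Because install_opener was called, the request can be sent with plain urlopen, which routes through the proxy and cookie handlers. Whether the login actually succeeds depends on the real form fields 163 expects, so this is only a sketch; the BeautifulSoup step just shows one use for the bs4 import:

response = urllib.request.urlopen(req)
soup = BeautifulSoup(response.read().decode('utf-8', 'ignore'), 'html.parser')
print(soup.title.string if soup.title else 'no <title> in response')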
import json
import sys
import urllib.parse
import urllib.request


def getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy):
    url = "https://twitter.com/i/search/timeline?f=tweets&q=%s&src=typd&max_position=%s"

    urlGetData = ''
    if hasattr(tweetCriteria, 'username'):
        urlGetData += ' from:' + tweetCriteria.username
    if hasattr(tweetCriteria, 'querySearch'):
        urlGetData += ' ' + tweetCriteria.querySearch
    if hasattr(tweetCriteria, 'near'):
        urlGetData += "&near:" + tweetCriteria.near + " within:" + tweetCriteria.within
    if hasattr(tweetCriteria, 'since'):
        urlGetData += ' since:' + tweetCriteria.since
    if hasattr(tweetCriteria, 'until'):
        urlGetData += ' until:' + tweetCriteria.until
    if hasattr(tweetCriteria, 'topTweets') and tweetCriteria.topTweets:
        url = "https://twitter.com/i/search/timeline?q=%s&src=typd&max_position=%s"

    url = url % (urllib.parse.quote(urlGetData), urllib.parse.quote(refreshCursor))

    headers = [
        ('Host', "twitter.com"),
        ('User-Agent', "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                       "(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"),
        ('Accept', "application/json, text/javascript, */*; q=0.01"),
        ('Accept-Language', "de,en-US;q=0.7,en;q=0.3"),
        ('X-Requested-With', "XMLHttpRequest"),
        ('Referer', url),
        ('Connection', "keep-alive"),
    ]

    if proxy:
        opener = urllib.request.build_opener(
            urllib.request.ProxyHandler({'http': proxy, 'https': proxy}),
            urllib.request.HTTPCookieProcessor(cookieJar))
    else:
        opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(cookieJar))
    opener.addheaders = headers

    try:
        response = opener.open(url)
        jsonResponse = response.read()
    except Exception:
        print("Twitter weird response. Try to see on browser: "
              "https://twitter.com/search?q=%s&src=typd"
              % urllib.parse.quote(urlGetData))
        sys.exit()

    dataJson = json.loads(jsonResponse)
    return dataJson
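A minimal invocation might build a bare criteria object whose attribute names mirror the hasattr checks above; note that this old search endpoint has since been retired by Twitter, so the sketch is illustrative only:

import http.cookiejar

class TweetCriteria:
    pass  # attributes are attached dynamically, matching the hasattr checks

criteria = TweetCriteria()
criteria.querySearch = 'python'  # free-text query
criteria.since = '2018-01-01'
criteria.until = '2018-06-01'

jar = http.cookiejar.CookieJar()
data = getJsonReponse(criteria, '', jar, None)  # empty cursor, no proxy
print(list(data.keys()))  # inspect what the endpoint returned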