def get_aid_page(aid_url):
    response = None
    try:
        req = urllib.request.Request(aid_url)
        # Set the request headers
        req.add_header("Host", "www.bilibili.com")
        req.add_header("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0")
        req.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
        req.add_header("Accept-Language", "zh-CN,en-US;q=0.7,en;q=0.3")
        req.add_header("Connection", "keep-alive")
        req.add_header("Accept-Encoding", "gzip, deflate")
        response = urllib.request.urlopen(req, timeout=10)
        # Decompress the body if the server returned gzip-encoded content
        if response.info()["Content-Encoding"] == "gzip":
            buf = io.BytesIO(response.read())
            f = gzip.GzipFile(fileobj=buf)
            aid_page_content = f.read().decode("utf-8")
        else:
            aid_page_content = response.read().decode("utf-8")
        #print(aid_page_content)
        return aid_page_content
    except Exception:
        raise
    finally:
        if response:
            response.close()
def _GetAuthCookie(self, auth_token):
    """Fetches authentication cookies for an authentication token.

    Args:
      auth_token: The authentication token returned by ClientLogin.

    Raises:
      HTTPError: If there was an error fetching the authentication cookies.
    """
    continue_location = "http://localhost/"
    args = {"continue": continue_location, "auth": auth_token}
    login_path = os.environ.get("APPCFG_LOGIN_PATH", "/_ah")
    req = self._CreateRequest("%s://%s%s/login?%s" %
                              (self.scheme, self.host, login_path,
                               urllib.parse.urlencode(args)))
    try:
        response = self.opener.open(req)
    except urllib.error.HTTPError as e:
        response = e
    if (response.code != 302 or
            response.info()["location"] != continue_location):
        raise urllib.error.HTTPError(req.get_full_url(), response.code,
                                     response.msg, response.headers,
                                     response.fp)
    self.authenticated = True
def proxyHttpBrowser(self, url):
    print(url)
    enable_proxy = True
    proxy_handler = urllib.request.ProxyHandler({"http": "http://some-proxy.com:8080"})
    handler = urllib.request.HTTPCookieProcessor(self.cookie)
    if enable_proxy:
        opener = urllib.request.build_opener(proxy_handler)
    else:
        opener = urllib.request.build_opener(handler)
    # Install the opener globally; urllib.request.urlopen will use it from now on
    urllib.request.install_opener(opener)
    req = urllib.request.Request(url)
    try:
        response = urllib.request.urlopen(req)
        # info() describes the fetched page, typically the headers sent by the server
        print(response.info())
        html = response.read()
    except urllib.error.HTTPError as e:
        print('HTTPCODE:', e.code)
        html = None
    except urllib.error.URLError as e:
        print(e.reason)
        html = None
    self.cookie.save(ignore_discard=True, ignore_expires=True)
    return html
def annotateResponse(self, response):
    info = response.info()
    try:
        mimeType, sep, mimeInfo = info["Content-Type"].partition(";")
        m = self.charsetRE.search(mimeInfo)
        if m is not None:
            encoding = m.group(1)
        else:
            encoding = None
        mimeType = mimeType.strip()
    except (AttributeError, KeyError):
        mimeType = "unknown/unknown"
        encoding = None
    try:
        response.handler = self.mimeMap[mimeType]
    except KeyError:
        print("fallback")
        # No exact MIME-type match; fall back to glob patterns
        for glob, handler in self.globMap:
            if fnmatch(mimeType, glob):
                response.handler = handler
                break
        else:
            raise URLLookupError("No handler for MIME type: {0}".format(mimeType))
    response.mimeType = mimeType
    response.encoding = encoding
    self.bufferResponse(response, info)
    response.url = urllib.parse.urlparse(response.geturl())
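# Note (sketch, not from the original code): response.info() returns an
# http.client.HTTPMessage in Python 3, so the MIME type and charset can also be
# read without splitting the Content-Type header by hand. python.org is only an
# example URL.
import urllib.request

with urllib.request.urlopen("https://www.python.org/") as response:
    info = response.info()
    print(info.get_content_type())            # e.g. "text/html"
    print(info.get_content_charset("utf-8"))  # falls back to "utf-8" if missing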
def add_dummy_subscription(data):
    rest_handler()
    request_url = top_level_url + "api/subscription"
    req = urllib.request.Request(request_url)
    req.add_header('Content-Type', 'application/json; charset=utf-8')
    jsondata = json.dumps(data)
    jsondataasbytes = jsondata.encode('utf-8')  # needs to be bytes
    req.add_header('Content-Length', len(jsondataasbytes))
    try:
        response = urllib.request.urlopen(req, jsondataasbytes)
        answer = json.loads(response.read().decode(
            response.info().get_param('charset') or 'utf-8'))
    except HTTPError as e:
        print('add_dummy_subscription HTTPError code: ', e.code)
        return None
    except URLError as e:
        print('add_dummy_subscription URLError Reason: ', e.reason)
        return None
    else:
        print(answer['data'][0])
        return answer['data'][0]
def post_register_user(userdata, url, callback):
    rest_handler_with_username("", "")
    request_url = top_level_url + "api/userregister"
    req = urllib.request.Request(request_url)
    req.add_header('Content-Type', 'application/json; charset=utf-8')
    jsondata = json.dumps(userdata)
    jsondataasbytes = jsondata.encode('utf-8')  # needs to be bytes
    req.add_header('Content-Length', len(jsondataasbytes))
    try:
        response = urllib.request.urlopen(req, jsondataasbytes)
        answer = json.loads(response.read().decode(
            response.info().get_param('charset') or 'utf-8'))
    except HTTPError as e:
        print('post_jsondata HTTPError code: ', e.code)
        return False
    except URLError as e:
        print('post_jsondata URLError Reason: ', e.reason)
        return False
    else:
        print(answer['data'])
        idaccounts = answer['data'][0]['idaccounts']
        post_register_user_image(idaccounts, userdata, url, callback)
        return True
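# The JSON-POST pattern used by the two helpers above can be exercised on its own;
# this is a sketch against an echo service, with https://httpbin.org/post standing
# in for the real top_level_url endpoints.
import json
import urllib.request

data = json.dumps({"name": "test"}).encode("utf-8")
req = urllib.request.Request("https://httpbin.org/post", data=data,
                             headers={"Content-Type": "application/json; charset=utf-8"})
with urllib.request.urlopen(req) as response:
    charset = response.info().get_param("charset") or "utf-8"
    print(json.loads(response.read().decode(charset)))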
def fetch_page(page):
    response = urllib.request.urlopen(page)
    if response.info().get('Content-Encoding') == 'gzip':
        # The body is gzip-compressed; wrap the raw bytes so GzipFile can decompress them
        response_buffer = BytesIO(response.read())
        unzipped_content = gzip.GzipFile(fileobj=response_buffer)
        return unzipped_content.read()
    else:
        return response.read()
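# A quick, self-contained check of the same Content-Encoding branch, added as a
# sketch: it asks for a gzip body explicitly and decompresses it the same way.
# https://httpbin.org/gzip is only an illustrative endpoint, not part of the original code.
import gzip
import urllib.request
from io import BytesIO

req = urllib.request.Request("https://httpbin.org/gzip",
                             headers={"Accept-Encoding": "gzip"})
with urllib.request.urlopen(req) as response:
    if response.info().get("Content-Encoding") == "gzip":
        body = gzip.GzipFile(fileobj=BytesIO(response.read())).read()
    else:
        body = response.read()
print(body[:80])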
def Start_split(url, client_count):
    writepath = 'file.txt'
    mode = 'ab' if os.path.exists(writepath) else 'wb+'
    req = HeadRequest(url)
    response = urllib.request.urlopen(req)
    response.close()
    print("Fileinfo ==>")
    print(response.info())
    strRes = str(response.info())
    contentlength = int(response.getheader("Content-Length"))
    global newLength
    newLength = contentlength
    print("N-Division requests")
    print("\tNo. of clients:", client_count)
    print("\tFileSize in bytes:", contentlength)
    # logging
    app.insert("\nN-Division requests")
    app.insert("\n\tNo. of clients:" + str(client_count))
    app.insert("\n\tFileSize in bytes:" + str(contentlength))
    #seekmer.create(contentlength)
    #print("sample file of content length created")
    urlRangeList = n_division(client_count, contentlength)
    for a in urlRangeList:
        print(a)
        app.insert("\n" + a)
    requests = []
    for x in urlRangeList:
        ss = "urllib.request.Request(" + url + ", headers={'Range':" + x + "})"
        requests.append(ss)
    # pass urlRangeList[i] to the clients_list[i]
    for i in range(client_count):
        clients = [[url, xx] for xx in urlRangeList]
    for test in clients:
        print(test)
    print("done")
    return clients
def search_google_query(queryurl):
    try:
        response = urllib.request.urlopen(queryurl)
        #Debugging purpose
        #print('\nThe response is :')
        #print(response)
        resp_html = response.read()
        encoding = response.info().get_content_charset('utf-8')
        resp_json = json.loads(resp_html.decode(encoding))
        # Once we have the JSON, print the htmlTitle and link of each of the response items using pprint
        print("\nSearch Response from Google are:\n")
        for i in range(5):
            pprint(resp_json['items'][i]['htmlTitle'])
            pprint(resp_json['items'][i]['link'])
            # htmlSnippet provides a couple of extra lines on the search item;
            # commented out to keep the output clear.
            #pprint(resp_json['items'][i]['htmlSnippet'])
        print('\n\n')
        # Was using the below to understand the Google response and find which objects to print out.
        #pprint(m)
        #fres = open("D:\\Python\\temp.txt", "w+")
        #fres.write(str(m))
        #fres.close()
        #print(html)
    except urllib.error.HTTPError as e:
        # HTTPError carries both a status code and a reason
        print("Code: %d, Reason: %s" % (e.code, e.reason))
    except urllib.error.URLError as e:
        # A plain URLError has no status code, only a reason
        print("Reason: %s" % e.reason)
    except:
        print("Unexpected Error", sys.exc_info()[0])
        raise
def get_dummy_accounts():
    rest_handler()
    request_url = top_level_url + "api/dummyaccountslist/"
    req = urllib.request.Request(request_url)
    try:
        response = urllib.request.urlopen(req)
        answer = json.loads(response.read().decode(
            response.info().get_param('charset') or 'utf-8'))
    except HTTPError as e:
        print('get_dummy_accounts Error code: ', e.code)
        return None
    except URLError as e:
        print('get_dummy_accounts Reason: ', e.reason)
        return None
    else:
        return answer['data']
def get_subscription_number():
    rest_handler()
    request_url = top_level_url + "api/subscriptionnumber"
    req = urllib.request.Request(request_url)
    try:
        response = urllib.request.urlopen(req)
        answer = json.loads(response.read().decode(
            response.info().get_param('charset') or 'utf-8'))
    except HTTPError as e:
        print('get_subscription_number HTTPError code: ', e.code)
        return None
    except URLError as e:
        print('get_subscription_number URLError Reason: ', e.reason)
        return None
    else:
        print(answer['data'][0])
        return answer['data'][0]['subscriptionnumber']
def delete_user_by_id(idaccounts):
    rest_handler()
    request_url = top_level_url + "api/userdatabyid/" + idaccounts
    req = urllib.request.Request(request_url)
    req.method = 'DELETE'
    try:
        response = urllib.request.urlopen(req)
        answer = json.loads(response.read().decode(
            response.info().get_param('charset') or 'utf-8'))
    except HTTPError as e:
        print('delete_user_by_id HTTPError code: ', e.code)
        return False
    except URLError as e:
        print('delete_user_by_id URLError Reason: ', e.reason)
        return False
    else:
        print(answer['data'])
        return True
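# Alternative (sketch): since Python 3.3, urllib.request.Request also accepts a
# method= argument, so the DELETE verb can be set at construction time. The URL
# below is a placeholder, not the real endpoint.
import urllib.request

req = urllib.request.Request("http://example.com/api/userdatabyid/42",
                             method="DELETE")
print(req.get_method())  # -> "DELETE"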
def query_weather_info(location, is_str):
    if is_str:
        query_url = OWM_URL + "q=" + location.strip() + OWM_API
    else:
        query_url = OWM_URL + "id=" + str(location) + OWM_API
    print("\nThe query url is : {}\n".format(query_url))
    # Note: print("The query url is :" % query_url) raises
    # "TypeError: not all arguments converted during string formatting"
    # because the format string has no %s placeholder.
    resp_json = None
    try:
        response = urllib.request.urlopen(query_url)
        resp_data = response.read()
        encoding = response.info().get_content_charset('utf-8')
        resp_json = json.loads(resp_data.decode(encoding))
    except urllib.error.HTTPError as e:
        print("\nError\n------\nException occurred!! Code: {} Reason: {}\n".format(e.code, e.reason))
    except urllib.error.URLError as e:
        print("\nError\n------\nException occurred!! Reason: {}\n".format(e.reason))
    return resp_json
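# Usage sketch: OWM_URL and OWM_API are assumed to be module-level constants along
# these lines (the API key is a placeholder), after which the function can be called
# with either a city string or a numeric OpenWeatherMap city id.
OWM_URL = "http://api.openweathermap.org/data/2.5/weather?"
OWM_API = "&appid=YOUR_API_KEY"

info_by_name = query_weather_info("London,uk", True)
info_by_id = query_weather_info(2643743, False)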
def get_submit_videos_page(url):
    response = None
    try:
        req = urllib.request.Request(url)
        # Set the request headers
        req.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
        req.add_header("Accept-Language", "zh-CN,en-US;q=0.7,en;q=0.3")
        req.add_header("Connection", "keep-alive")
        req.add_header("Host", "space.bilibili.com")
        req.add_header("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0")
        req.add_header("Accept-Encoding", "gzip, deflate, br")
        # Request parameters
        params = {
            "mid": "18199039",
            "page": "1",
            "pagesize": "100"
        }
        data = urllib.parse.urlencode(params).encode("utf-8")
        response = urllib.request.urlopen(req, data, timeout=10)
        # Decompress the body if the server returned gzip-encoded content
        if response.info()["Content-Encoding"] == "gzip":
            buf = io.BytesIO(response.read())
            f = gzip.GzipFile(fileobj=buf)
            page_content = f.read().decode("utf-8")
        else:
            page_content = response.read().decode("utf-8")
        #print(page_content)
        return page_content
    except Exception:
        raise
    finally:
        if response:
            response.close()
def crawl(self, link):
    tryOnce = 0
    robotParser = self.setupRobotParser(link)
    if robotParser.can_fetch("*", link):
        while True:
            try:
                response = urllib.request.urlopen(link)
                break
            except urllib.error.HTTPError as e:
                if e.code == 429:
                    if tryOnce == 1:
                        print('Thread ' + str(self.crawlerID) + ': Too many requests: ' + link + ' returning.')
                        return
                    print('Thread ' + str(self.crawlerID) + ': Too many requests: ' + link + ' trying again in 120 seconds.')
                    sleep(120)
                    tryOnce = 1
                else:
                    return
            # for handling any other url errors
            except:
                print('Error opening link: ', link, " by thread : ", self.crawlerID)
                return

        returnedLink = response.geturl()
        if returnedLink != link:
            print('Thread ' + str(self.crawlerID) + ': Redirection:' + link + ' to ' + returnedLink + ' returning.')
            return

        urlInfo = response.info()
        dataType = urlInfo.get_content_type()
        if 'html' not in dataType:
            print('Thread ' + str(self.crawlerID) + ': Not HTML ' + link + ' returning.')
            return

        try:
            webContent = response.read().decode(response.headers.get_content_charset('utf-8'))
        except:
            print("Incomplete Read of web content due to a defective http server.")
            webContent = None

        if webContent:
            Crawler.webpagesLock.acquire()
            if Crawler.webpagesSaved < NUMOFPAGES:
                Crawler.webpagesSaved += 1
            else:
                print('Thread ' + str(self.crawlerID) + ': Page number limit reached ')
                Crawler.webpagesLock.release()
                return
            Crawler.webpagesLock.release()

            selector = None
            while True:
                try:
                    selector = WebPages.select().where(WebPages.pageURL == returnedLink).exists()
                    break
                except (OperationalError, sqlite3.OperationalError) as e:
                    if 'binding' in str(e):
                        break
                    print('Thread ', self.crawlerID, ': Database busy, retrying. WebPagesTable')
                except:
                    break

            if selector:
                print('Thread ' + str(self.crawlerID) + ': Updating webpage ' + link)
                while True:
                    try:
                        WebPages.update(pageContent=webContent).where(
                            WebPages.pageURL == returnedLink).execute()
                        break
                    except (OperationalError, sqlite3.OperationalError) as e:
                        if 'binding' in str(e):
                            break
                        print('Thread ', self.crawlerID, ': Database busy, retrying. WebPagesTable')
                    except:
                        break
            else:
                print('Thread ' + str(self.crawlerID) + ': Saving webpage ' + link)
                try:
                    inserted = False
                    while True:
                        try:
                            if not inserted:
                                WebPages(pageURL=returnedLink, pageContent=webContent).save()
                                inserted = True
                            ...
                            PageRank.create(pageURL=returnedLink).update()
                            ...
                            break
                        except (OperationalError, sqlite3.OperationalError) as e:
                            if 'binding' in str(e):
                                break
                            print('Thread ', self.crawlerID, ': Database busy, retrying. WebPagesTable & PageRank')
                            sleep(randint(1, 5))
                        except:
                            break
                # should never happen
                except:
                    print('UnexpectedException: In saving webpage WEEEEEEEEEEEEEEEEEEEEEEE')

            print('Thread ' + str(self.crawlerID) + ': Done saving webpage and starting link extraction ' + link)
            try:
                parser = MyHTMLParser(link)
                parser.feed(str(webContent))
            # should never happen
            except:
                print('UnexpectedException: in parser WEEEEEEEEEEEEEEEEEEEEEEE')

            size = 999
            while True:
                try:
                    for i in range(0, len(parser.links), size):
                        UncrawledTable.insert_many(parser.links[i:i + size]).upsert().execute()
                    break
                except (OperationalError, sqlite3.OperationalError) as e:
                    if 'binding' in str(e):
                        break
                    print('Thread ', self.crawlerID, ': Database busy, retrying. UnCrawledTable')
                except:
                    break

            while True:
                try:
                    print("UNCRAWLED URLS = ", UncrawledTable.select().count(), ' Thread ' + str(self.crawlerID))
                    break
                except (OperationalError, sqlite3.OperationalError) as e:
                    if 'binding' in str(e):
                        break
                    print('Thread ', self.crawlerID, ': Database busy, retrying. print UnCrawledTable')
                except:
                    break

            print('Thread ' + str(self.crawlerID) + ': Done inserting links ' + link)
import urllib.request

# # queryval = {'q': 'hello'}
# # querystr = urllib.parse.urlencode(queryval)
# # url = url + '?' + querystr
# # req = urllib.request.Request(url, None, headers)
# # # with urllib.request.urlopen(req) as response:
# # #     rtnpage = response.read().decode('utf-8')
# # #     print(rtnpage)
# # try:
# #     urllib.request.urlopen(req)
# # except urllib.error.HTTPError as e:
# #     print(e.code)
# #     print(e.read())

if __name__ == '__main__':
    with urllib.request.urlopen('http://python.org') as response:
        print('real url---', response.geturl())
        for k, v in response.info().items():
            print(k, '==', v)
        html = response.read()

    import subprocess
    import os.path

    if os.path.exists('tmp.html') and os.path.isfile('tmp.html'):
        print('remove tmp.html')
        os.remove('tmp.html')
    rtnv = subprocess.check_output(['touch', 'tmp.html'])

    with open('./tmp.html', 'r+') as filehandle:
        for line in html.splitlines():
            filehandle.write(line.decode('utf-8'))
            filehandle.write('\n')
        filehandle.seek(1)
# encoding=UTF-8
import urllib.request as request
import urllib.response
import json
import sys

GossipingIndexUrl = "https://www.ptt.cc/bbs/Gossiping/index.html"

#httpsConOpener = urllib.request.build_opener()
#response = httpsConOpener.open(GossipingIndexUrl)
response = urllib.request.urlopen(GossipingIndexUrl)
resp_url = response.geturl()
#resp_info = response.info()
#resp_content = response.read().decode("UTF-8")
#print(resp_url)
#print(resp_info)
#print(resp_content)

# PTT redirects this board to the over-18 confirmation page
if "ask/over18" in resp_url:
    print("ooops")
    post_req = request.Request(resp_url, b"yes:yes")
    response = request.urlopen(post_req)
    print(response.geturl())
    print(response.info())
    print(response.read())

print("end")
import re
import requests
from urllib import request
from bs4 import BeautifulSoup

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.post('http://httpbin.org/post', payload)
print("#No.0001:")
print(ret.text)

url = 'https://www.baidu.com/'
req = request.Request(url)
response = request.urlopen(req)
print("#No.1==>type of response:")
content = response.read()
con1 = response.readlines()  # the body was already consumed by read(), so this is empty
con2 = response.info()
con3 = response.getcode()
con4 = response.geturl()
print(content)
print(con1, "\n", con2, "\n", con3, "\n", con4, "\n")

url2 = 'http://blog.csdn.net/ritterliu/article/details/70243112'
req2 = request.Request(url2)
response2 = request.urlopen(req2)
content2 = BeautifulSoup(response2.read(), "html5lib")
print("#No.2==>", content2.title)
print("#No.3==>", content2.find_all(name='h1'))
namelist = content2.find_all(name='img')
print("#No.4==>")
#!/usr/bin/python3
'''
takes in a URL, sends a request to the URL
'''
if __name__ == "__main__":
    from urllib import request, response
    from sys import argv

    req = request.Request(argv[1])
    with request.urlopen(req) as response:
        respuesta = response.info()
        print(respuesta["X-Request-Id"])