def parse_body(url):
    global retry, ban, count
    try:
        rsp = requests.get(url, headers=headers, proxies=proxy.get(), timeout=15)
    except Exception:
        retry += 1
        print("Retries so far: {}".format(retry))
        return parse_body(url)
    if rsp.status_code == 503:
        ban += 1
        print("IP bans so far: {}".format(ban))
        return parse_body(url)
    soup = BeautifulSoup(rsp.text, "html.parser")
    title = soup.find_all("h4")[-1].text
    body = soup.find(class_="x-wiki-content x-main-content")
    # Center the title, style it as <h1>, and prepend it to the body
    center_tag = soup.new_tag("center")
    title_tag = soup.new_tag("h1")
    title_tag.string = title
    center_tag.insert(0, title_tag)
    body.insert(0, center_tag)
    html = html_template.format(body=str(body))
    count += 1
    print("Pages fetched successfully: {}".format(count))
    # Lazy-loaded images keep the real URL in data-src; point src at it
    html = html.replace("data-src", "src")
    return html
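# parse_body() above leans on module-level state (requests, BeautifulSoup, a shared
# headers dict, an html_template page wrapper, and the retry/ban/count counters) that
# this snippet does not show. A minimal sketch of that assumed setup -- the names and
# values below are illustrative placeholders, not the original configuration:
import requests
from bs4 import BeautifulSoup

import proxy  # assumed local helper that returns a requests-style proxies mapping

headers = {"User-Agent": "Mozilla/5.0"}  # placeholder request headers
html_template = "<html><head><meta charset=\"utf-8\"></head><body>{body}</body></html>"  # placeholder wrapper
retry = 0  # failed requests that were re-crawled
ban = 0    # 503 responses treated as IP bans
count = 0  # pages fetched successfully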
def excuse(jenni, input):
    a = re.compile(r'<a [\s\S]+>(.*)</a>')
    try:
        page = proxy.get('http://programmingexcuses.com/')
    except:
        return jenni.say("I'm all out of excuses!")
    results = a.findall(page)
    if results:
        result = results[0].strip()
        # make sure the excuse ends with punctuation
        if result[-1] not in ['.', '?', '!']:
            print 'lastchar:', str(result[-1])
            result += '.'
        jenni.say(result)
    else:
        jenni.say("I'm too lazy to find an excuse.")
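# jenni/phenny-style bots normally wire a module function to IRC commands by setting
# attributes on it after the definition. Assuming this module follows that convention
# (the attribute values here are illustrative, not taken from the original source):
excuse.commands = ['excuse']
excuse.priority = 'medium'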
import requests

import proxy

# Quick connectivity check: fetch a page through the proxy and report the status code.
try:
    rsp = requests.get("https://www.baidu.com/", proxies=proxy.get(), timeout=15)
except Exception:
    pass
else:
    print(rsp.status_code)
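# In the two requests-based snippets above, proxy.get() is passed as the `proxies`
# argument, so it is presumably a helper returning a requests-compatible proxy mapping.
# A minimal sketch under that assumption (the address below is a hypothetical placeholder):
def get():
    """Return a proxies dict in the format requests expects."""
    addr = "127.0.0.1:8080"  # hypothetical proxy host:port, e.g. drawn from a proxy pool
    return {"http": "http://" + addr, "https": "https://" + addr}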
def c(jenni, input):
    '''.c -- Google calculator.'''
    ## let's not bother if someone doesn't give us input
    if not input.group(2):
        return jenni.reply('Nothing to calculate.')

    ## handle some unicode conversions
    q = input.group(2).encode('utf-8')
    q = q.replace('\xcf\x95', 'phi')  # utf-8 U+03D5
    q = q.replace('\xcf\x80', 'pi')   # utf-8 U+03C0

    ## Attempt #1 (Google)
    uri = 'https://www.google.com/search?gbv=1&q='
    uri += web.urllib.quote(q)

    ## To the webs!
    try:
        page = proxy.get(uri)
    except:
        ## if we can't access Google for calculating
        ## let us move on to Attempt #2
        page = web.get(uri)

    answer = False
    if page:
        ## if we get a response from Google
        ## let us parse out an equation from Google Search results
        answer = c_answer.findall(page)

    if answer:
        ## if the regex finding found a match we want the first result
        answer = answer[0]
        answer = answer.encode('utf-8')
        answer = answer.decode('utf-8')
        answer = ''.join(chr(ord(c)) for c in answer)
        answer = uc.decode(answer)
        answer = answer.replace('<sup>', '^(')
        answer = answer.replace('</sup>', ')')
        answer = web.decode(answer)
        answer = answer.strip()
        answer += ' [GC]'
        jenni.say(answer)
    else:
        #### Attempt #2 (DuckDuckGo's API)
        ddg_uri = 'https://api.duckduckgo.com/?format=json&q='
        ddg_uri += urllib.quote(q)

        ## Try to grab page (results)
        ## If page can't be accessed, we shall fail!
        try:
            page = proxy.get(ddg_uri)
        except:
            page = web.get(ddg_uri)

        ## Try to take page source and json-ify it!
        try:
            json_response = json.loads(page)
        except:
            ## if it can't be json-ified, then we shall fail!
            json_response = None

        ## Check for 'AnswerType' (stolen from search.py)
        ## Also 'fail' to None so we can move on to Attempt #3
        ## (json_response is a dict, so use a key test rather than hasattr)
        if (not json_response) or ('AnswerType' in json_response
                                   and json_response['AnswerType'] != 'calc'):
            answer = None
        else:
            ## If the json contains an Answer that is the result of 'calc'
            ## then continue
            answer = json_response['Answer']
            ## discard the inline <style> block DDG prepends to the answer
            parts = answer.split('</style>')
            answer = ''.join(parts[1:])
            answer = re.sub(r'<.*?>', '', answer).strip()

        if answer:
            ## If we have found answer with Attempt #2
            ## go ahead and display it
            answer += ' [DDG API]'
            jenni.say(answer)
        else:
            #### Attempt #3 (Wolfram Alpha)
            status, answer = get_wa(q)
            if status:
                jenni.say(answer + ' [WA]')
            else:
                #### Attempt #4 (DuckDuckGo's HTML)
                ## This relies on BeautifulSoup; if it can't be found, don't even bother
                try:
                    from BeautifulSoup import BeautifulSoup
                except:
                    return jenni.say('No results. (Please install BeautifulSoup for additional checking.)')

                new_url = 'https://duckduckgo.com/html/?q=%s&kl=us-en&kp=-1' % (web.urllib.quote(q))
                try:
                    ddg_html_page = proxy.get(new_url)
                except:
                    ddg_html_page = web.get(new_url)

                soup = BeautifulSoup(ddg_html_page)
                ## use BeautifulSoup to parse HTML for an answer
                zero_click = str()
                if soup('div', {'class': 'zero-click-result'}):
                    zero_click = str(soup('div', {'class': 'zero-click-result'})[0])

                ## remove some excess text
                output = r_tag.sub('', zero_click).strip()
                output = output.replace('\n', '').replace('\t', '')

                ## test to see if the search module has 'remove_spaces'
                ## otherwise, let us fail
                try:
                    output = search.remove_spaces(output)
                except:
                    output = str()

                if output:
                    ## If Attempt #4 worked, display the answer
                    jenni.say(output + ' [DDG HTML]')
                else:
                    ## If we made it this far, we have tried all available resources
                    jenni.say('Absolutely no results!')
def c(jenni, input):
    '''.c -- Google calculator.'''
    ## let's not bother if someone doesn't give us input
    if not input.group(2):
        return jenni.reply('Nothing to calculate.')

    ## handle some unicode conversions
    q = input.group(2).encode('utf-8')
    q = q.replace('\xcf\x95', 'phi')  # utf-8 U+03D5
    q = q.replace('\xcf\x80', 'pi')   # utf-8 U+03C0

    ## Attempt #1 (Google)
    uri = 'https://www.google.com/search?gbv=1&q='
    uri += web.urllib.quote(q)

    ## To the webs!
    try:
        page = proxy.get(uri)
    except:
        ## if we can't access Google for calculating
        ## let us move on to Attempt #2
        page = web.get(uri)

    answer = False
    if page:
        ## if we get a response from Google
        ## let us parse out an equation from Google Search results
        answer = c_answer.findall(page)

    if answer:
        ## if the regex finding found a match we want the first result
        answer = answer[0]
        #answer = answer.replace(u'\xc2\xa0', ',')
        answer = answer.encode('unicode-escape')
        answer = answer.decode('unicode-escape')
        answer = ''.join(chr(ord(c)) for c in answer)
        answer = uc.decode(answer)
        answer = answer.replace('<sup>', '^(')
        answer = answer.replace('</sup>', ')')
        answer = web.decode(answer)
        answer = answer.strip()
        answer += ' [GC]'
        jenni.say(answer)
    else:
        #### Attempt #2 (DuckDuckGo's API)
        ddg_uri = 'https://api.duckduckgo.com/?format=json&q='
        ddg_uri += urllib.quote(q)

        ## Try to grab page (results)
        ## If page can't be accessed, we shall fail!
        try:
            page = proxy.get(ddg_uri)
        except:
            page = web.get(ddg_uri)

        ## Try to take page source and json-ify it!
        try:
            json_response = json.loads(page)
        except:
            ## if it can't be json-ified, then we shall fail!
            json_response = None

        ## Check for 'AnswerType' (stolen from search.py)
        ## Also 'fail' to None so we can move on to Attempt #3
        ## (json_response is a dict, so use a key test rather than hasattr)
        if (not json_response) or ('AnswerType' in json_response
                                   and json_response['AnswerType'] != 'calc'):
            answer = None
        else:
            ## If the json contains an Answer that is the result of 'calc'
            ## then continue
            answer = re.sub(r'\<.*?\>', '', json_response['Answer']).strip()

        if answer:
            ## If we have found answer with Attempt #2
            ## go ahead and display it
            answer += ' [DDG API]'
            jenni.say(answer)
        else:
            #### Attempt #3 (Wolfram Alpha)
            status, answer = get_wa(q)
            if status:
                jenni.say(answer + ' [WA]')
            else:
                #### Attempt #4 (DuckDuckGo's HTML)
                ## This relies on BeautifulSoup; if it can't be found, don't even bother
                try:
                    from BeautifulSoup import BeautifulSoup
                except:
                    return jenni.say('No results. (Please install BeautifulSoup for additional checking.)')

                new_url = 'https://duckduckgo.com/html/?q=%s&kl=us-en&kp=-1' % (web.urllib.quote(q))
                try:
                    ddg_html_page = proxy.get(new_url)
                except:
                    ddg_html_page = web.get(new_url)

                soup = BeautifulSoup(ddg_html_page)
                ## use BeautifulSoup to parse HTML for an answer
                zero_click = str()
                if soup('div', {'class': 'zero-click-result'}):
                    zero_click = str(soup('div', {'class': 'zero-click-result'})[0])

                ## remove some excess text
                output = r_tag.sub('', zero_click).strip()
                output = output.replace('\n', '').replace('\t', '')

                ## test to see if the search module has 'remove_spaces'
                ## otherwise, let us fail
                try:
                    output = search.remove_spaces(output)
                except:
                    output = str()

                if output:
                    ## If Attempt #4 worked, display the answer
                    jenni.say(output + ' [DDG HTML]')
                else:
                    ## If we made it this far, we have tried all available resources
                    jenni.say('Absolutely no results!')
url = "https://www.lectio.dk/lectio/%s/ExerciseFileGet.aspx?type=opgavedef&exercisefileid=%s" % ( config["school_id"], config["excercise_description_file"] ) if session is False: session = authenticate.authenticate(config) cookies = { "lecmobile" : "0", "ASP.NET_SessionId" : session["ASP.NET_SessionId"], "LastLoginUserName" : session["LastLoginUserName"], "lectiogsc" : session["lectiogsc"], "LectioTicket" : session["LectioTicket"] } # Insert User-agent headers and the cookie information headers = { "User-Agent" : "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1665.2 Safari/537.36", "Content-Type" : "application/x-www-form-urlencoded", "Host" : "www.lectio.dk", "Origin" : "https://www.lectio.dk", "Cookie" : functions.implode(cookies, "{{index}}={{value}}", "; ") } r = proxy.get(url, stream=True) file_name = config["file_name"] with open(file_name, 'wb') as f: for chunk in r.iter_content(chunk_size=1024): if chunk: # filter out keep-alive new chunks f.write(chunk) f.flush()