Example #1
def parse_body(url):
    global retry, ban, count
    try:
        rsp = requests.get(url,
                           headers=headers,
                           proxies=proxy.get(),
                           timeout=15)
    except Exception:
        retry += 1
        print("重爬次数:{}".format(retry))
        return parse_body(url)

    if rsp.status_code == 503:
        ban += 1
        print("IP ban count: {}".format(ban))
        return parse_body(url)

    soup = BeautifulSoup(rsp.text, "html.parser")
    title = soup.find_all("h4")[-1].text
    body = soup.find(class_="x-wiki-content x-main-content")

    # Center the title, give it h1 styling, and prepend it to the body
    center_tag = soup.new_tag("center")
    title_tag = soup.new_tag("h1")
    title_tag.string = title
    center_tag.insert(0, title_tag)
    body.insert(0, center_tag)

    html = html_template.format(body=str(body))
    count += 1
    print("Pages crawled successfully: {}".format(count))
    html = html.replace("data-src", "src")
    return html
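
parse_body() above relies on several module-level names (headers, html_template, the retry/ban/count counters, and the proxy helper) that are defined outside the snippet. A minimal sketch of that surrounding setup follows; every value in it is an assumption rather than part of the original source:

import requests
from bs4 import BeautifulSoup

import proxy  # assumed helper whose get() returns a requests-style proxies dict

retry = 0  # failed requests
ban = 0    # HTTP 503 responses (IP bans)
count = 0  # pages crawled successfully

headers = {"User-Agent": "Mozilla/5.0"}  # placeholder User-Agent

html_template = (
    "<!DOCTYPE html><html><head><meta charset='utf-8'></head>"
    "<body>{body}</body></html>"
)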
Example #2
def excuse(jenni, input):
    a = re.compile(r'<a [\s\S]+>(.*)</a>')
    try:
        page = proxy.get('http://programmingexcuses.com/')
    except:
        return jenni.say("I'm all out of excuses!")
    results = a.findall(page)
    if results:
        result = results[0]
        result = result.strip()
        if result and result[-1] not in ['.', '?', '!']:
            print 'lastchar:', str(result[-1])
            result += '.'
        jenni.say(result)
    else:
        jenni.say("I'm too lazy to find an excuse.")
Example #3
import requests
import proxy

try:
    rsp = requests.get("https://www.baidu.com/",
                       proxies=proxy.get(),
                       timeout=15)
except Exception:
    pass
else:
    print(rsp.status_code)
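
Note that proxy is not one single library across these snippets. In Examples #1 and #3, proxy.get() is called with no arguments and its result is passed as the proxies= argument to requests.get, so it presumably returns a requests-style proxies mapping; in the remaining examples, proxy.get(url) fetches the page itself. A minimal sketch of the first variant, with the pool contents purely assumed:

import random

_POOL = ["http://127.0.0.1:8080"]  # placeholder proxy addresses, assumed

def get():
    """Return a requests-style proxies dict for a randomly chosen proxy."""
    addr = random.choice(_POOL)
    return {"http": addr, "https": addr}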
Example #4
def c(jenni, input):
    '''.c -- Google calculator.'''

    ## let's not bother if someone doesn't give us input
    if not input.group(2):
        return jenni.reply('Nothing to calculate.')

    ## handle some unicode conversions
    q = input.group(2).encode('utf-8')
    q = q.replace('\xcf\x95', 'phi')  # utf-8 U+03D5
    q = q.replace('\xcf\x80', 'pi')  # utf-8 U+03C0

    ## Attempt #1 (Google)
    uri = 'https://www.google.com/search?gbv=1&q='
    uri += web.urllib.quote(q)

    ## To the webs!
    try:
        page = proxy.get(uri)
    except:
        ## if we can't access Google for calculating
        ## let us move on to Attempt #2
        page = web.get(uri)

    answer = False
    if page:
        ## if we get a response from Google
        ## let us parse out an equation from Google Search results
        answer = c_answer.findall(page)

    if answer:
        ## if the regex finding found a match we want the first result
        answer = answer[0]
        answer = answer.encode('utf-8')
        answer = answer.decode('utf-8')
        answer = ''.join(chr(ord(c)) for c in answer)
        answer = uc.decode(answer)
        answer = answer.replace('<sup>', '^(')
        answer = answer.replace('</sup>', ')')
        answer = web.decode(answer)
        answer = answer.strip()
        answer += ' [GC]'
        jenni.say(answer)
    else:
        #### Attempt #2 (DuckDuckGo's API)
        ddg_uri = 'https://api.duckduckgo.com/?format=json&q='
        ddg_uri += urllib.quote(q)

        ## Try to grab page (results)
        ## If page can't be accessed, we shall fail!
        try:
            page = proxy.get(ddg_uri)
        except:
            page = web.get(ddg_uri)

        ## Try to take page source and json-ify it!
        try:
            json_response = json.loads(page)
        except:
            ## if it can't be json-ified, then we shall fail!
            json_response = None

        ## Check for 'AnswerType' (stolen from search.py)
        ## Also 'fail' to None so we can move on to Attempt #3
        ## json.loads returns a dict, so check the key with .get(), not hasattr()
        if (not json_response) or json_response.get('AnswerType') != 'calc':
            answer = None
        else:
            ## If the json contains an Answer that is the result of 'calc'
            ## then continue
            answer = json_response['Answer']
            parts = answer.split('</style>')
            answer = ''.join(parts[1:])
            answer = re.sub(r'<.*?>', '', answer).strip()

        if answer:
            ## If we have found answer with Attempt #2
            ## go ahead and display it
            answer += ' [DDG API]'
            jenni.say(answer)

        else:
            #### Attempt #3 (Wolfram Alpha)
            status, answer = get_wa(q)

            if status:
                jenni.say(answer + ' [WA]')

            else:
                #### Attempt #4 (DuckDuckGo's HTML)
                ## This relies on BeautifulSoup; if it can't be found, don't even bother
                try:
                    from BeautifulSoup import BeautifulSoup
                except:
                    return jenni.say(
                        'No results. (Please install BeautifulSoup for additional checking.)'
                    )

                new_url = 'https://duckduckgo.com/html/?q=%s&kl=us-en&kp=-1' % (
                    web.urllib.quote(q))
                try:
                    ddg_html_page = proxy.get(new_url)
                except:
                    ddg_html_page = web.get(new_url)
                soup = BeautifulSoup(ddg_html_page)

                ## use BeautifulSoup to parse HTML for an answer
                zero_click = str()
                if soup('div', {'class': 'zero-click-result'}):
                    zero_click = str(
                        soup('div', {'class': 'zero-click-result'})[0])

                ## remove some excess text
                output = r_tag.sub('', zero_click).strip()
                output = output.replace('\n', '').replace('\t', '')

                ## test to see if the search module has 'remove_spaces'
                ## otherwise, let us fail
                try:
                    output = search.remove_spaces(output)
                except:
                    output = str()

                if output:
                    ## If Attempt #4 worked, display the answer
                    jenni.say(output + ' [DDG HTML]')
                else:
                    ## If we made it this far, we have tried all available resources
                    jenni.say('Absolutely no results!')
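
The function above is a four-step fallback chain: try Google, then the DuckDuckGo API, then Wolfram Alpha, then DuckDuckGo's HTML, and report the first usable answer. The same control flow can be expressed generically; this is a sketch of the pattern only, not a drop-in replacement for the jenni module:

def first_answer(q, backends):
    """backends: iterable of (fetch, tag) pairs; fetch(q) returns text or None."""
    for fetch, tag in backends:
        try:
            answer = fetch(q)
        except Exception:
            continue  # a failing backend simply falls through to the next
        if answer:
            return '%s [%s]' % (answer, tag)
    return 'Absolutely no results!'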
Example #5
def c(jenni, input):
    '''.c -- Google calculator.'''

    ## let's not bother if someone doesn't give us input
    if not input.group(2):
        return jenni.reply('Nothing to calculate.')

    ## handle some unicode conversions
    q = input.group(2).encode('utf-8')
    q = q.replace('\xcf\x95', 'phi')  # utf-8 U+03D5
    q = q.replace('\xcf\x80', 'pi')  # utf-8 U+03C0

    ## Attempt #1 (Google)
    uri = 'https://www.google.com/search?gbv=1&q='
    uri += web.urllib.quote(q)

    ## To the webs!
    try:
        page = proxy.get(uri)
    except:
        ## if we can't access Google for calculating
        ## let us move on to Attempt #2
        page = web.get(uri)

    answer = False
    if page:
        ## if we get a response from Google
        ## let us parse out an equation from Google Search results
        answer = c_answer.findall(page)

    if answer:
        ## if the regex finding found a match we want the first result
        answer = answer[0]
        #answer = answer.replace(u'\xc2\xa0', ',')
        answer = answer.encode('unicode-escape')
        answer = answer.decode('unicode-escape')
        answer = ''.join(chr(ord(c)) for c in answer)
        answer = uc.decode(answer)
        answer = answer.replace('<sup>', '^(')
        answer = answer.replace('</sup>', ')')
        answer = web.decode(answer)
        answer = answer.strip()
        answer += ' [GC]'
        jenni.say(answer)
    else:
        #### Attempt #2 (DuckDuckGo's API)
        ddg_uri = 'https://api.duckduckgo.com/?format=json&q='
        ddg_uri += urllib.quote(q)

        ## Try to grab page (results)
        ## If page can't be accessed, we shall fail!
        try:
            page = proxy.get(ddg_uri)
        except:
            page = web.get(ddg_uri)

        ## Try to take page source and json-ify it!
        try:
            json_response = json.loads(page)
        except:
            ## if it can't be json-ified, then we shall fail!
            json_response = None

        ## Check for 'AnswerType' (stolen from search.py)
        ## Also 'fail' to None so we can move on to Attempt #3
        ## json.loads returns a dict, so check the key with .get(), not hasattr()
        if (not json_response) or json_response.get('AnswerType') != 'calc':
            answer = None
        else:
            ## If the json contains an Answer that is the result of 'calc'
            ## then continue
            answer = re.sub(r'\<.*?\>', '', json_response['Answer']).strip()

        if answer:
            ## If we have found answer with Attempt #2
            ## go ahead and display it
            answer += ' [DDG API]'
            jenni.say(answer)

        else:
            #### Attempt #3 (Wolfram Alpha)
            status, answer = get_wa(q)

            if status:
                jenni.say(answer + ' [WA]')

            else:
                #### Attempt #4 (DuckDuckGo's HTML)
                ## This relies on BeautifulSoup; if it can't be found, don't even bother
                try:
                    from BeautifulSoup import BeautifulSoup
                except:
                    return jenni.say('No results. (Please install BeautifulSoup for additional checking.)')

                new_url = 'https://duckduckgo.com/html/?q=%s&kl=us-en&kp=-1' % (web.urllib.quote(q))
                try:
                    ddg_html_page = proxy.get(new_url)
                except:
                    ddg_html_page = web.get(new_url)
                soup = BeautifulSoup(ddg_html_page)

                ## use BeautifulSoup to parse HTML for an answer
                zero_click = str()
                if soup('div', {'class': 'zero-click-result'}):
                    zero_click = str(soup('div', {'class': 'zero-click-result'})[0])

                ## remove some excess text
                output = r_tag.sub('', zero_click).strip()
                output = output.replace('\n', '').replace('\t', '')

                ## test to see if the search module has 'remove_spaces'
                ## otherwise, let us fail
                try:
                    output = search.remove_spaces(output)
                except:
                    output = str()

                if output:
                    ## If Attempt #4 worked, display the answer
                    jenni.say(output + ' [DDG HTML]')
                else:
                    ## If we made it this far, we have tried all available resources
                    jenni.say('Absolutely no results!')
Example #6
		url = "https://www.lectio.dk/lectio/%s/ExerciseFileGet.aspx?type=opgavedef&exercisefileid=%s" % ( config["school_id"], config["excercise_description_file"] )

	if session is False:
		session = authenticate.authenticate(config)

	cookies = {
		"lecmobile" : "0",
		"ASP.NET_SessionId" : session["ASP.NET_SessionId"],
		"LastLoginUserName" : session["LastLoginUserName"],
		"lectiogsc" : session["lectiogsc"],
		"LectioTicket" : session["LectioTicket"]
	}

	# Insert User-agent headers and the cookie information
	headers = {
		"User-Agent" : "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1665.2 Safari/537.36",
		"Content-Type" : "application/x-www-form-urlencoded",
		"Host" : "www.lectio.dk",
		"Origin" : "https://www.lectio.dk",
		"Cookie" : functions.implode(cookies, "{{index}}={{value}}", "; ")
	}

	# headers (including the session cookie) are built above but were never sent;
	# passing them here assumes proxy.get() forwards keyword args to requests.get()
	r = proxy.get(url, headers=headers, stream=True)

	file_name = config["file_name"]

	with open(file_name, 'wb') as f:
		for chunk in r.iter_content(chunk_size=1024):
			if chunk:  # filter out keep-alive new chunks
				f.write(chunk)
				f.flush()
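
functions.implode(cookies, "{{index}}={{value}}", "; ") serializes the cookie dict into a single Cookie header value. The helper itself is not shown in this snippet; judging from the call site it behaves roughly like the following (an assumption, not the original implementation):

def implode(items, template, glue):
    """Render each key/value pair through the template, then join with glue."""
    parts = []
    for index, value in items.items():
        part = template.replace("{{index}}", str(index))
        part = part.replace("{{value}}", str(value))
        parts.append(part)
    return glue.join(parts)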