Example #1
    def annotateResponse(self, response):
        info = response.info()
        try:
            mimeType, sep, mimeInfo = info["Content-Type"].partition(";")
            m = self.charsetRE.search(mimeInfo)
            if m is not None:
                encoding = m.group(1)
            else:
                encoding = None
            mimeType = mimeType.strip()
        except (AttributeError, KeyError):
            mimeType = "unknown/unknown"
            encoding = None

        try:
            response.handler = self.mimeMap[mimeType]
        except KeyError:
            # No exact MIME-type match; fall back to the glob-pattern map.
            print("fallback")
            for glob, handler in self.globMap:
                if fnmatch(mimeType, glob):
                    response.handler = handler
                    break
            else:
                raise URLLookupError("No handler for MIME type: {0}".format(mimeType))

        response.mimeType = mimeType
        response.encoding = encoding
        self.bufferResponse(response, info)
        response.url = urllib.parse.urlparse(response.geturl())
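
Example #1 depends on class attributes (charsetRE, mimeMap, globMap) defined elsewhere in its class. As a rough, self-contained sketch of the same Content-Type parsing idea, using an assumed charset regex rather than the original charsetRE:

import re

# Assumed stand-in for the class's charsetRE; the original pattern is not shown above.
CHARSET_RE = re.compile(r"""charset=["']?([^"';\s]+)""", re.IGNORECASE)

def split_content_type(header_value):
    """Split a Content-Type header value into (mime_type, encoding or None)."""
    if not header_value:
        return "unknown/unknown", None
    mime_type, _, params = header_value.partition(";")
    match = CHARSET_RE.search(params)
    encoding = match.group(1) if match else None
    return mime_type.strip(), encoding

print(split_content_type("text/html; charset=UTF-8"))  # ('text/html', 'UTF-8')
print(split_content_type(None))                        # ('unknown/unknown', None)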
Example #2
    def login(self):
        """login web.
        
        """
        # Set up a cookie jar and install a cookie-aware opener.
        cookie_jar = http.cookiejar.LWPCookieJar()
        cookie_support = urllib.request.HTTPCookieProcessor(cookie_jar)
        opener = urllib.request.build_opener(cookie_support, urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)

        # Get the validation code (CAPTCHA).
        code = self.validate_code()
        print("Validate Code: %s" % code)

        # Post the login form data.
        login_url = "http://%s/UserAction.do" % self.host
        data = urlencode({"method": "login", "userid": self.uid, "passWord": self.password, "validateCode": code})
        request = urllib.request.Request(login_url, data.encode())

        try:  
            response = urllib.request.urlopen(request)
        except (URLError, HTTPError):
            self.write_log("Error: Access Login URL fail.", "%s")
            return
        else:
            if response.geturl().find("welcome.jsp") > 0:
                self.write_log(("Login feedback3 OK.", self.uid),  "%s (%s)")
                return True
            else:
                self.write_log(("Error: Login fail.", self.uid), "%s (%s)")
                return False
Example #3
    def _process_uncached_url(self, url, no_description=False):
        request = urllib.request.Request(url, headers={
            "User-Agent": self.userAgent,
            "Accept": self.acceptHeader
        })
        try:
            startTime = time.time()
            response = urllib.request.urlopen(request, timeout=self.timeout)
            timeTaken = time.time() - startTime
        except socket.timeout:
            raise URLLookupError("Timed out")
        except urllib.error.URLError as err:
            raise URLLookupError(err.reason)
        except Exception as err:
            raise URLLookupError(type(err).__name__)
        try:
            newURL = response.geturl()
            if newURL != url and self.showRedirects:
                yield "→ <{0}>".format(newURL)

            self.annotateResponse(response)

            if response.contentLength is not None:
                sizeFormatted = formatBytes(response.contentLength)
            else:
                sizeFormatted = "unknown size"

            responseIter = iter(response.handler.processResponse(response, no_description=no_description))
            firstLine = next(responseIter)
            for line in self.responseFormats:
                yield line.format(time=timeTaken,
                        size=sizeFormatted,
                        plugin=firstLine)
            for line in responseIter:
                yield line
        finally:
            response.close()
            del response.buf
            del response
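
Example #3 relies on self.timeout, self.userAgent, URLLookupError and the handler map set up in Example #1. A minimal standalone sketch of just the timing-and-redirect part, with an arbitrary User-Agent and RuntimeError standing in for URLLookupError:

import socket
import time
import urllib.error
import urllib.request

def fetch_with_timing(url, timeout=10.0):
    """Open url and return (elapsed_seconds, redirected, final_url)."""
    request = urllib.request.Request(url, headers={"User-Agent": "example-bot/0.1"})
    start = time.time()
    try:
        response = urllib.request.urlopen(request, timeout=timeout)
    except socket.timeout:
        raise RuntimeError("Timed out")
    except urllib.error.URLError as err:
        raise RuntimeError(err.reason)
    with response:
        elapsed = time.time() - start
        final_url = response.geturl()  # final URL after any redirects
        return elapsed, final_url != url, final_url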
Example #4
def login(browser):
    browser.open('https://mbasic.facebook.com/login.php')
    browser.select_form(nr=0)
    browser.form['email'] = emailw
    browser.form['pass'] = passw
    if "checkpoint" in browser.submit().geturl():
        print("[Checkpoint] Acc đã bị checkpoint 2fa")
        if token2fa:
            browser.select_form(nr=0)
            browser.form['approvals_code'] = requests.get("https://2fa.live/tok/"+token2fa).json()['token']
            browser.submit()
            browser.select_form(nr=0)
            browser.form.find_control(name="name_action_selected").value = ['dont_save',]
            response=browser.submit()
            if "checkpoint" in response.geturl():
                browser.select_form(nr=0)
                response=browser.submit()
                browser.select_form(nr=0)
                response=browser.submit(name='submit[This was me]')
                browser.select_form(nr=0)
                browser.form.find_control(name="name_action_selected").value = ['dont_save',]
                response=browser.submit()
    return response
Example #5
import requests
from bs4 import BeautifulSoup
from urllib import request

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.post('http://httpbin.org/post', data=payload)
print("#No.0001:")
print(ret.text)

url = 'https://www.baidu.com/'

req = request.Request(url)
response = request.urlopen(req)
print("#No.1==>type of response:")
content = response.read()
con1 = response.readlines()  # empty: read() above has already consumed the body
con2 = response.info()
con3 = response.getcode()
con4 = response.geturl()
print(content)
print(con1, "\n", con2, "\n", con3, "\n", con4, "\n")

url2 = 'http://blog.csdn.net/ritterliu/article/details/70243112'
req2 = request.Request(url2)
response2 = request.urlopen(req2)
content2 = BeautifulSoup(response2.read(), "html5lib")

print("#No.2==>", content2.title)
print("#No.3==>", content2.find_all(name='h1'))

img_tags = content2.find_all(name='img')
print("#No.4==>")
for img in img_tags:
    print(img)
Example #6
def get_content(channel_id, config=None):
    channel = Channel.get(channel_id)
    logger = get_logger('rss', channel)
    if not config:

        def get_param(x):
            return channel.get_config_param(x)
    else:

        def get_param(x):
            return config[x]

    url = get_param('url')
    # A list of rules in the form slide_element;entry_item;regexp
    parser_rules = get_param('parser_rules')
    # A list of rules in the form entry_item;string
    additional_rules = get_param('additional_rules')
    filter = get_param('filter')
    # A list of rules in the form entry_item;string
    exception_rules = get_param('exception_rules')
    no_slides = get_param('no_slides')
    time_limit = get_param('time_limit')
    duration = get_param('duration') * 1000
    template = get_param('template')
    theme = get_param('theme')
    min_age = datetime.now() - timedelta(days=time_limit)

    if not config or not config.get('feed'):
        entries = feedparser_parse(url)['entries'][:no_slides]
    else:
        entries = config.get('feed')
    capsules = []
    last_entries = []

    with open(cache_path, 'r+') as f:
        fcntl.flock(f, fcntl.LOCK_EX)
        cache = json.load(f, cls=DateTimeDecoder)

        for entry in entries:
            if 'published_parsed' in entry:
                entry_age = datetime.fromtimestamp(
                    mktime(entry['published_parsed']))
                if entry_age >= min_age:
                    last_entries.append(entry)
            else:
                entry_hash = hash_dict(entry)
                if entry_hash in cache:
                    if cache[entry_hash] >= min_age:
                        last_entries.append(entry)
                else:
                    cache[entry_hash] = datetime.now()
        f.seek(0)
        json.dump(cache, f, cls=DateTimeEncoder)
        f.truncate()
        fcntl.flock(f, fcntl.LOCK_UN)

    for entry in last_entries:
        slide_content = {}
        link_page = None
        for slide_element, entry_item, regexp in [
                rule.split(';') for rule in parser_rules
        ]:
            field, input_type = slide_element.split(':')
            if field not in slide_content:
                slide_content[field] = {}

            if entry_item == 'link_page':
                if not link_page:
                    with urllib.request.urlopen(entry.link) as response:
                        link_page = response.read().decode(errors='ignore')
                        entry_item += "@" + response.geturl(
                        )  # in case of redirect(s), geturl() returns the final url of the page
                item = link_page
            else:
                item = deep_get(entry, *entry_item.split('.'))

            value = get_value(item, regexp)
            if input_type == 'src' and not _is_url(value):
                ref_url = entry.link
                if entry_item.startswith('link_page@'):
                    ref_url = entry_item.split('@')[1]
                value = urljoin(ref_url, value)

            slide_content[field].update({input_type: value})

        for slide_element, string in [
                rule.split(';') for rule in additional_rules
        ]:
            field, input_type = slide_element.split(':')
            if field not in slide_content:
                slide_content[field] = {}
            if string.lower() == 'qrcode':
                input_type = 'qrcode'
                string = entry.link
            slide_content[field].update({input_type: string})

        if len(exception_rules) == 1 and not exception_rules[0].strip():
            capsules.append(
                RssCapsule(theme=theme,
                           slides=[
                               RssSlide(content=slide_content,
                                        template=template,
                                        duration=duration)
                           ]))
        else:
            for entry_item, regexp in [
                    rule.split(';') for rule in exception_rules
            ]:
                if entry_item == 'link_page':
                    if not link_page:
                        with urllib.request.urlopen(entry.link) as response:
                            link_page = response.read().decode(errors='ignore')
                            entry_item += "@" + response.geturl(
                            )  # in case of redirect(s), geturl() returns the final url of the page
                    item = link_page
                else:
                    item = deep_get(entry, *entry_item.split('.'))

                value = get_value(item, regexp)

                if filter and value is None:
                    capsules.append(
                        RssCapsule(theme=theme,
                                   slides=[
                                       RssSlide(content=slide_content,
                                                template=template,
                                                duration=duration)
                                   ]))

                if not filter and value is not None:
                    capsules.append(
                        RssCapsule(theme=theme,
                                   slides=[
                                       RssSlide(content=slide_content,
                                                template=template,
                                                duration=duration)
                                   ]))

    return capsules
Example #7
    def crawl(self, link):

        tryOnce = 0
        robotParser = self.setupRobotParser(link)
        if robotParser.can_fetch("*", link):
            while True:
                try:
                    response = urllib.request.urlopen(link)
                    break
                except urllib.error.HTTPError as e:
                    if e.code == 429:
                        if tryOnce == 1:
                            print(
                                'Thread ' + str(self.crawlerID) + ': Too many requests: ' + link + ' returning.')
                            return
                        print('Thread ' + str(self.crawlerID) + ': Too many requests: ' + link + ' trying again in 120 seconds.')
                        sleep(120)
                        tryOnce = 1
                    else:
                        return
                # for handling any other url errors
                except:
                    print('Error opening link: ', link, ' by thread: ', self.crawlerID)

                    return

            returnedLink = response.geturl()
            if returnedLink != link:
                print('Thread ' + str(self.crawlerID) + ': Redirection:' + link + ' to ' + returnedLink + ' returning.')
                return

            urlInfo = response.info()
            dataType = urlInfo.get_content_type()
            if 'html' not in dataType:
                print('Thread ' + str(self.crawlerID) + ': Not HTML ' + link + ' returning.')
                return

            try:
                webContent = response.read().decode(response.headers.get_content_charset('utf-8'))
            except:
                print("Incomplete Read of web content due to a defective http server.")
                webContent = None

            if(webContent):
                Crawler.webpagesLock.acquire()
                if Crawler.webpagesSaved < NUMOFPAGES:
                    Crawler.webpagesSaved += 1
                else:
                    print('Thread ' + str(self.crawlerID) + ': Page number limit reached ')
                    Crawler.webpagesLock.release()
                    return
                Crawler.webpagesLock.release()
                selector = None
                while True:
                    try:
                        selector = WebPages.select().where(WebPages.pageURL == returnedLink).exists()
                        break
                    except (OperationalError, sqlite3.OperationalError) as e:
                        if 'binding' in str(e):
                            break
                        print('Thread ', self.crawlerID, ': Database busy, retrying. WebPagesTable')
                    except:
                        break

                if selector:
                    print('Thread ' + str(self.crawlerID) + ': Updating webpage ' + link)

                    while True:
                        try:
                            WebPages.update(pageContent=webContent).where(
                                WebPages.pageURL == returnedLink).execute()
                            break
                        except (OperationalError, sqlite3.OperationalError) as e:
                            if 'binding' in str(e):
                                break
                            print('Thread ', self.crawlerID, ': Database busy, retrying. WebPagesTable')
                        except:
                            break

                else:
                    print('Thread ' + str(self.crawlerID) + ': Saving webpage ' + link )
                    try:
                        inserted = False
                        while True:
                            try:
                                if not inserted:
                                    WebPages(pageURL=returnedLink, pageContent=webContent).save()
                                    inserted = True
                                ...
                                PageRank.create(pageURL=returnedLink).update()
                                ...
                                break
                            except (OperationalError, sqlite3.OperationalError) as e:
                                if 'binding' in str(e):
                                    break
                                print('Thread ', self.crawlerID, ': Database busy, retrying. WebPagesTable & PageRank')
                                sleep(randint(1,5))

                            except:
                                break
                    #should never happen
                    except:
                        print('UnexpectedException: In saving webpage WEEEEEEEEEEEEEEEEEEEEEEE')

                print('Thread ' + str(self.crawlerID) + ': Done saving webpage and starting link extraction ' + link)
                try:
                    parser = MyHTMLParser(link)
                    parser.feed(str(webContent))
                #should never happen
                except:
                    print('UnexpectedException: in parser WEEEEEEEEEEEEEEEEEEEEEEE')

                size = 999  # batch size kept below SQLite's default limit on bound variables
                while True:
                    try:
                        for i in range(0, len(parser.links), size):
                            UncrawledTable.insert_many(parser.links[i:i + size]).upsert().execute()
                        break
                    except (OperationalError, sqlite3.OperationalError) as e:
                        if 'binding' in str(e):
                            break
                        print('Thread ', self.crawlerID, ': Database busy, retrying. UnCrawledTable')
                    except:
                        break

                while True:
                    try:
                        print("UNCRAWLED URLS = ", UncrawledTable.select().count(), ' Thread ' + str(self.crawlerID))
                        break
                    except (OperationalError, sqlite3.OperationalError) as e:
                        if 'binding' in str(e):
                            break
                        print('Thread ', self.crawlerID, ': Database busy, retrying. print UnCrawledTable')
                    except:
                        break

                print('Thread ' + str(self.crawlerID) + ': Done inserting links ' + link)
Example #8
#     #     headers = {'User-Agent': user_agent}
#     #     queryval = {'q': 'hello'}
#     #     querystr = urllib.parse.urlencode(queryval)
#     #     url = url + '?' + querystr
#     #     req = urllib.request.Request(url, None, headers)
#     #     # with urllib.request.urlopen(req) as response:
#     #     #     rtnpage = response.read().decode('utf-8')
#     #     #     print(rtnpage)
#     #     try:
#     #         urllib.request.urlopen(req)
#     #     except urllib.error.HTTPError as e:
#     #         print(e.code)
#     #         print(e.read())
import urllib.request

if __name__ == '__main__':
    with urllib.request.urlopen('http://python.org') as response:
        print('real url---', response.geturl())
        for k, v in response.info().items():
            print(k, '==', v)
        html = response.read()
        import subprocess
        import os.path

        if os.path.exists('tmp.html') and os.path.isfile('tmp.html'):
            print('remove tmp.html')
            os.remove('tmp.html')
        # 'touch' creates tmp.html so that it can then be opened in 'r+' mode.
        rtnv = subprocess.check_output(['touch', 'tmp.html'])
        with open('./tmp.html', 'r+') as filehandle:
            for line in html.splitlines():
                filehandle.write(line.decode('utf-8'))
                filehandle.write('\n')
Example #9
#!/usr/bin/env python
# encoding=UTF-8

import urllib.request as request
import urllib.response
import json
import sys

GossipingIndexUrl = "https://www.ptt.cc/bbs/Gossiping/index.html"

#httpsConOpener = urllib.request.build_opener()
#response = httpsConOpener.open(GossipingIndexUrl)
response = urllib.request.urlopen(GossipingIndexUrl)

resp_url = response.geturl()
#resp_info = response.info()
#resp_content = response.read().decode("UTF-8")

#print(resp_url)
#print(resp_info)
#print(resp_content)

if "ask/over18" in resp_url:
	print("ooops")
	post_req =request.Request(resp_url, b"yes:yes")
	response = request.urlopen(post_req)
	print(response.geturl())
	print(response.info())
	print(response.read())
Example #10
# -*- coding:utf-8 -*-
import urllib.request
import urllib.response


class RedirectHandler(urllib.request.HTTPRedirectHandler):
    def http_error_301(self, req, fp, code, msg, headers):
        # Returning None means 301 redirects are not followed; the default
        # error handler then raises an HTTPError for the 301 response.
        pass

    def http_error_302(self, req, fp, code, msg, headers):
        # Follow the redirect as usual, then stamp the redirect status code
        # and the final URL onto the response that comes back.
        result = urllib.request.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        result.status = code
        result.newurl = result.geturl()
        return result


opener = urllib.request.build_opener(RedirectHandler)
url = "http://www.baidu.com"
response = opener.open(url)
data = response.read().decode()
print(data)
print(response.geturl())
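
A possible usage sketch for Example #10's handler, assuming http://httpbin.org/redirect/1 is reachable and answers with a 302 (the URL is not part of the original example):

# The custom http_error_302 above stamps the redirect status code and the
# final URL onto the response object that opener.open() returns.
resp = opener.open("http://httpbin.org/redirect/1")  # assumed test endpoint
print(getattr(resp, "status", None))                 # 302, set by http_error_302
print(getattr(resp, "newurl", resp.geturl()))        # final URL after the redirect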