Exemplo n.º 1
0
    def __init__(self,
                 url,
                 method='GET',
                 headers=None,
                 cookies=None,
                 refer=None,
                 data=None,
                 user_agent=None,
                 **kwargs):
        """Build a request description for ``url``.

        Args:
            url: A ``URL`` instance, or any value accepted by the ``URL``
                constructor.
            method: HTTP verb; stored upper-cased.
            headers: Optional mapping of extra request headers. It is copied,
                so the caller's mapping is never modified.
            cookies: Optional value for the ``Cookie`` header.
            refer: Optional value for the ``Referer`` header.
            data: Optional request body, returned by ``get_post_parm``.
            user_agent: Optional value for the ``User-Agent`` header.
            **kwargs: Accepted for call compatibility; not used here.
        """
        self._url = url if isinstance(url, URL) else URL(url)

        self._method = method.upper()
        # Start from the caller-supplied headers. The previous code ignored
        # them and crashed with AttributeError on ``None.update`` whenever
        # cookies were given without headers.
        self._headers = dict(headers) if headers else {}
        self._cookies = cookies
        self._refer = refer
        self._user_agent = user_agent
        if self._cookies:
            self._headers["Cookie"] = self._cookies
        if self._refer:
            self._headers["Referer"] = self._refer
        if self._user_agent:
            self._headers["User-Agent"] = self._user_agent
        # NOTE: the attribute name ``_get_date`` (sic) is kept because other
        # code may read it.
        self._get_date = self._url.get_querystring()

        # Always define the attribute so later reads cannot raise
        # AttributeError when no body was supplied.
        self._post_data = data if data else None
Exemplo n.º 2
0
    def _find_header_urls(self, headers):
        """Collect URLs carried by response headers into ``self._tag_urls``.

        Only headers whose name is listed in ``self.URL_HEADERS`` are
        considered. Values starting with ``http`` are used as-is; any other
        value is joined onto ``self._base_url`` first.

        Args:
            headers: Mapping of header name to header value.
        """
        for key, value in headers.items():
            # Skip headers that do not carry a URL, and empty values.
            if key not in self.URL_HEADERS or not value:
                continue

            try:
                if value.startswith('http'):
                    url = URL(value, encoding=self._encoding)
                else:
                    joined = self._base_url.urljoin(value).url_string
                    url = URL(joined, encoding=self._encoding)
            except ValueError:
                # Same policy as the other URL extractors in this file:
                # a value that cannot be parsed as a URL is ignored.
                continue

            # Only reached for a header that actually produced a URL; the
            # previous code added ``url`` for every header, which raised
            # NameError or re-added a stale value.
            self._tag_urls.add(url)
Exemplo n.º 3
0
    def _find_regex_urls(self, doc_str):
        """Extract URLs from raw document text and add them to ``self._re_urls``.

        Absolute URLs are matched with ``HtmlParser.URL_RE``. Relative URLs
        are meant to be matched by a second pattern that is still a
        placeholder, so that pass currently contributes nothing.

        Args:
            doc_str: Document text to scan.
        """
        re_urls = set()

        for match in re.findall(HtmlParser.URL_RE, doc_str):
            try:
                url = URL(match[0], encoding=self._encoding)
            except ValueError:
                # Not parseable as a URL; ignore this match.
                continue
            re_urls.add(url)

        def find_relative(doc_str):
            """Return the set of URLs built from relative references."""
            res = set()
            regex = ''  # TODO: supply the relative-URL pattern

            # An empty pattern matches the empty string at every position,
            # and indexing those empty matches raised IndexError. Until a
            # real pattern exists there is nothing to look for.
            if not regex:
                return res

            relative_regex = re.compile(regex, re.U | re.I)

            for match in relative_regex.findall(doc_str):
                # findall yields tuples when the pattern has several groups
                # and plain strings otherwise.
                match_str = match[0] if isinstance(match, tuple) else match
                if not match_str:
                    continue
                try:
                    # NOTE(review): sibling methods call ``urljoin`` on the
                    # base URL while this one calls ``join_url`` - confirm
                    # which name the URL class actually provides.
                    joined = self._base_url.join_url(match_str).url_string
                    res.add(URL(joined, encoding=self._encoding))
                except ValueError:
                    # Same policy as the absolute-URL loop above.
                    continue
            return res

        re_urls.update(find_relative(doc_str))
        self._re_urls.update(re_urls)
Exemplo n.º 4
0
    def _find_tag_urls(self, tag, attrs):
        """Collect URLs found in a tag's attributes into ``self._tag_urls``.

        An attribute is considered when its name is in ``self.URL_ATTRS``,
        its value is non-empty and the value does not start with ``#``.
        Values starting with ``http`` are used as-is; any other value is
        joined onto ``self._base_url`` first.

        Args:
            tag: Name of the tag being processed (not used here).
            attrs: Mapping of attribute name to attribute value.
        """
        # ``items()`` works on both Python 2 and 3; ``iteritems()`` does not
        # exist on Python 3 dicts.
        for attr_name, attr_value in attrs.items():
            if attr_name not in self.URL_ATTRS or not attr_value:
                continue
            # ``str.startswith`` is the real method name; the previous
            # ``startwith`` raised AttributeError for every candidate.
            if attr_value.startswith('#'):
                continue

            try:
                if attr_value.startswith('http'):
                    url = URL(attr_value, encoding=self._encoding)
                else:
                    joined = self._base_url.urljoin(attr_value).url_string
                    url = URL(joined, encoding=self._encoding)
            except ValueError:
                # Value could not be parsed as a URL; ignore it.
                continue

            self._tag_urls.add(url)
Exemplo n.º 5
0
        def find_relative(doc_str):
            """Return the set of URLs built from relative references in ``doc_str``.

            The relative-URL pattern is still a placeholder, so an empty set
            is returned until one is supplied.
            """
            res = set()
            regex = ''  # TODO: supply the relative-URL pattern

            # An empty pattern matches the empty string at every position,
            # and indexing those empty matches raised IndexError. Until a
            # real pattern exists there is nothing to look for.
            if not regex:
                return res

            relative_regex = re.compile(regex, re.U | re.I)

            for match in relative_regex.findall(doc_str):
                # findall yields tuples when the pattern has several groups
                # and plain strings otherwise.
                match_str = match[0] if isinstance(match, tuple) else match
                if not match_str:
                    continue
                try:
                    # NOTE(review): confirm that the base URL object exposes
                    # ``join_url``; it is called exactly as before.
                    joined = self._base_url.join_url(match_str).url_string
                    res.add(URL(joined, encoding=self._encoding))
                except ValueError:
                    # A candidate that cannot be parsed as a URL is ignored.
                    continue
            return res
Exemplo n.º 6
0
 def post(self, url, headers=None, data=None, **kwargs):
     """Send a POST request and wrap the outcome with ``_make_response``.

     Args:
         url: A ``URL`` instance, or any value accepted by the ``URL``
             constructor.
         headers: Optional extra headers, passed to
             ``get_default_headers``. Defaults to an empty mapping.
         data: Optional request body, forwarded to ``requests.post``.
         **kwargs: Extra keyword arguments forwarded to ``requests.post``.

     Returns:
         Whatever ``_make_response`` returns. When the request raises,
         ``_make_response`` receives ``None`` in place of the response.
     """
     # A fresh dict per call avoids sharing one mutable default between calls.
     if headers is None:
         headers = {}
     default_headers = self.get_default_headers(headers)
     if not isinstance(url, URL):
         url = URL(url)

     try:
         # ``data`` was previously accepted but never sent with the request.
         requests_response = requests.post(url.url_string,
                                           headers=default_headers,
                                           data=data,
                                           **kwargs)
     except Exception:
         # Best effort, as before: a failed request yields a response built
         # from ``None``. KeyboardInterrupt/SystemExit now propagate.
         requests_response = None
     return self._make_response(requests_response, url)
Exemplo n.º 7
0
def is_similar_url(urla1, urlb2):
    """Tell whether two URLs are considered similar.

    Both arguments are wrapped in ``URL``. If the path of the first URL
    contains fewer than two ``/`` characters the result is ``True`` without
    any further comparison. Otherwise the URLs are similar when host, port
    and the value ``txt_wrap_by('/', '/', path)`` yields for each path
    (presumably the first path segment - confirm against ``txt_wrap_by``)
    are all equal.
    """
    first = URL(urla1)
    second = URL(urlb2)

    first_host, second_host = first.get_host(), second.get_host()
    first_port, second_port = first.get_port(), second.get_port()
    first_path, second_path = first.get_path(), second.get_path()

    # Shallow paths on the first URL short-circuit to "similar".
    if first_path.count('/') < 2:
        return True

    first_segment = txt_wrap_by('/', '/', first_path)
    second_segment = txt_wrap_by('/', '/', second_path)

    if first_host != second_host:
        return False
    if first_port != second_port:
        return False
    return first_segment == second_segment
Exemplo n.º 8
0
class WSRequest:
    """Description of a single HTTP request: URL, method, headers and body."""

    def __init__(self,
                 url,
                 method='GET',
                 headers=None,
                 cookies=None,
                 refer=None,
                 data=None,
                 user_agent=None,
                 **kwargs):
        """Build a request description for ``url``.

        Args:
            url: A ``URL`` instance, or any value accepted by the ``URL``
                constructor.
            method: HTTP verb; stored upper-cased.
            headers: Optional mapping of extra request headers. It is copied,
                so the caller's mapping is never modified.
            cookies: Optional value for the ``Cookie`` header.
            refer: Optional value for the ``Referer`` header.
            data: Optional request body, returned by ``get_post_parm``.
            user_agent: Optional value for the ``User-Agent`` header.
            **kwargs: Accepted for call compatibility; not used here.
        """
        self._url = url if isinstance(url, URL) else URL(url)

        self._method = method.upper()
        # Start from the caller-supplied headers. The previous code ignored
        # them and crashed with AttributeError on ``None.update`` whenever
        # cookies were given without headers.
        self._headers = dict(headers) if headers else {}
        self._cookies = cookies
        self._refer = refer
        self._user_agent = user_agent
        if self._cookies:
            self._headers["Cookie"] = self._cookies
        if self._refer:
            self._headers["Referer"] = self._refer
        if self._user_agent:
            self._headers["User-Agent"] = self._user_agent
        # NOTE: the attribute name ``_get_date`` (sic) is kept because
        # subclasses or other code may read it.
        self._get_date = self._url.get_querystring()

        # Always define the attribute so ``get_post_parm`` and ``__str__``
        # cannot raise AttributeError when no body was supplied.
        self._post_data = data if data else None

    def get_get_param(self):
        """Return the query-string data of the request."""
        return self._get_date

    def get_post_parm(self):
        """Return the request body, or ``None`` when there is none."""
        return self._post_data

    def get_url(self):
        """Return the target URL object."""
        return self._url

    def get_method(self):
        """Return the upper-cased HTTP method."""
        return self._method

    def get_headers(self):
        """Return the request headers mapping."""
        return self._headers

    def get_cookies(self):
        """Return the cookie value given to the request."""
        return self._cookies

    def set_post_data(self, postdata):
        """Replace the request body."""
        self._post_data = postdata

    def set_get_data(self, getdata):
        """Replace the query-string data."""
        self._get_date = getdata

    def set_refer(self, refer):
        """Replace the stored referer value (the headers are not rebuilt)."""
        self._refer = refer

    def set_cookies(self, cookies):
        """Replace the stored cookie value (the headers are not rebuilt)."""
        self._cookies = cookies

    def __str__(self):
        """Render the request as raw HTTP/1.1 text.

        The ``Host`` header is always taken from the URL, overriding any
        ``Host`` entry in the stored headers. The body is appended only for
        POST requests that have one.
        """
        lines = ["%s %s HTTP/1.1" % (self._method, self._url.url_string)]

        # A shallow copy is enough: only the top-level mapping is modified.
        headers = dict(self._headers)
        headers["Host"] = self._url.get_host()
        lines.extend("%s: %s" % (key, value) for key, value in headers.items())

        # Header section ends with an empty line.
        result_string = "\r\n".join(lines) + "\r\n\r\n"

        if self._method == "POST" and self._post_data is not None:
            result_string += str(self._post_data)

        # ``__str__`` must return the native ``str`` type. Encoding is only
        # needed when the text is a Python 2 ``unicode`` object; on Python 3
        # an unconditional encode returned bytes and raised TypeError.
        if not isinstance(result_string, str):
            result_string = result_string.encode("utf-8")

        return result_string

    def __repr__(self):
        """Return a short debugging representation of the request."""
        # ``get_id`` is not defined by this class; use it when a subclass
        # provides it and fall back to the object identity otherwise.
        get_id = getattr(self, 'get_id', None)
        vals = {
            'method': self.get_method(),
            'url': str(self.get_url()),
            'id': get_id() if callable(get_id) else id(self)
        }

        return '<Request | %(method)s | %(url)s + %(id)s>' % vals