示例#1
0
    def preprocess_link(self, referrer, url):
        """Normalize a discovered link and decide whether it should be crawled.

        The link is resolved against ``referrer``, one trailing "/" is removed
        from its path and its fragment is discarded.

        Returns the normalized absolute URL, or None when ``url`` is empty,
        when the scheme is neither http nor https, or when the http or https
        form of the URL is already present in ``self.url_set``.
        """
        if not url:
            return None

        # Resolve relative links first so the split sees an absolute URL.
        parts = urlsplit(urljoin(referrer, url))
        normalized = SplitResult(
            parts.scheme,
            parts.netloc,
            re.sub(r'/$', '', parts.path),  # drop one trailing "/"
            parts.query,
            '',  # targets within a page are not separate documents
        )

        # For each accepted scheme: the (old, new) prefix pair that turns the
        # URL into the same address under the other scheme.
        scheme_swaps = {
            'http': ('http:', 'https:'),
            'https': ('https:', 'http:'),
        }
        if normalized.scheme not in scheme_swaps:
            return None

        candidate = normalized.geturl()
        old_prefix, new_prefix = scheme_swaps[normalized.scheme]
        counterpart = candidate.replace(old_prefix, new_prefix, 1)

        # Either scheme variant being recorded means the page is already known.
        if candidate in self.url_set or counterpart in self.url_set:
            return None
        return candidate
    def build_url(url):
        """Assemble a URL string from a dictionary of URL components.

        ``url`` is indexed with the ``UrlParser`` key constants. The collected
        parts are expanded into ``SplitResult(**url_result)``, so the SCHEME,
        NETLOC, PATH, QUERY and FRAGMENT constants must equal the
        ``SplitResult`` field names.

        Network location: a supplied NETLOC is reused verbatim only when a
        non-empty USERNAME is given and occurs inside it. In every other case
        it is rebuilt as ``[user[:password]@]hostname[:port]``.

        Path: FILENAME; when PATH is non-empty it is prepended with a "/" and
        runs of slashes in the joined path are collapsed to a single one.

        The fragment of the result is always empty.

        Returns:
            The assembled URL, as produced by ``SplitResult.geturl()``.

        Raises:
            Exception: ``url`` is empty or has no non-empty SCHEME entry.
            KeyError: presumably (for a plain dict) when HOSTNAME, needed to
                rebuild the location, or FILENAME is absent; both are read by
                direct indexing.
        """
        url_result = {UrlParser.QUERY: "", UrlParser.FRAGMENT: ""}
        if not url or UrlParser.SCHEME not in url or not url[UrlParser.SCHEME]:
            raise Exception("UrlParser:build_url", "Url dictionary is empty or missing key values")

        url_result[UrlParser.SCHEME] = url[UrlParser.SCHEME]

        if UrlParser.NETLOC in url and url[UrlParser.NETLOC]:
            if (
                UrlParser.USERNAME in url
                and url[UrlParser.USERNAME]
                and url[UrlParser.USERNAME] in url[UrlParser.NETLOC]
            ):
                url_result[UrlParser.NETLOC] = url[UrlParser.NETLOC]
        if UrlParser.NETLOC not in url_result:
            url_result[UrlParser.NETLOC] = url[UrlParser.HOSTNAME]
            if UrlParser.PORT in url and url[UrlParser.PORT]:
                # Host and port must be separated by ":"; without it
                # "example.com" and 8080 fuse into "example.com8080".
                url_result[UrlParser.NETLOC] += ":" + str(url[UrlParser.PORT])
            if UrlParser.USERNAME in url and url[UrlParser.USERNAME]:
                credentials = "{}@".format(url[UrlParser.USERNAME])
                if UrlParser.PASSWORD in url and url[UrlParser.PASSWORD]:
                    credentials = "{}:{}@".format(url[UrlParser.USERNAME], url[UrlParser.PASSWORD])
                url_result[UrlParser.NETLOC] = credentials + url_result[UrlParser.NETLOC]

        url_result[UrlParser.PATH] = url[UrlParser.FILENAME]
        if UrlParser.PATH in url and url[UrlParser.PATH]:
            url_result[UrlParser.PATH] = url[UrlParser.PATH] + "/" + url_result[UrlParser.PATH]
            # Joining may double up slashes at the seam; collapse them.
            url_result[UrlParser.PATH] = re.sub("//+", "/", url_result[UrlParser.PATH])

        if UrlParser.QUERY in url and url[UrlParser.QUERY]:
            url_result[UrlParser.QUERY] = url[UrlParser.QUERY]

        result = SplitResult(**url_result)
        return result.geturl()
    def build_url(url):
        """Build a URL string out of a dictionary of URL components.

        ``url`` is read through the ``UrlParser`` key constants. Since the
        gathered parts are passed on as ``SplitResult(**url_result)``, the
        SCHEME, NETLOC, PATH, QUERY and FRAGMENT constants have to match the
        ``SplitResult`` field names.

        Network location: an incoming NETLOC is kept as-is only if a non-empty
        USERNAME is supplied and is contained in it. Otherwise the location is
        recomposed as ``[user[:password]@]hostname[:port]``.

        Path: FILENAME; a non-empty PATH is put in front of it with a '/', and
        consecutive slashes in the joined path are reduced to one.

        The returned URL never carries a fragment.

        Returns:
            The URL string produced by ``SplitResult.geturl()``.

        Raises:
            Exception: ``url`` is empty or lacks a non-empty SCHEME entry.
            KeyError: presumably (for a plain dict) if HOSTNAME, required when
                the location is recomposed, or FILENAME is missing; both are
                accessed by direct indexing.
        """
        url_result = {UrlParser.QUERY: '', UrlParser.FRAGMENT: ''}
        if not url or UrlParser.SCHEME not in url or not url[UrlParser.SCHEME]:
            raise Exception('UrlParser:build_url', 'Url dictionary is empty or missing key values')

        url_result[UrlParser.SCHEME] = url[UrlParser.SCHEME]

        if UrlParser.NETLOC in url and url[UrlParser.NETLOC]:
            if (UrlParser.USERNAME in url
                    and url[UrlParser.USERNAME]
                    and url[UrlParser.USERNAME] in url[UrlParser.NETLOC]):
                url_result[UrlParser.NETLOC] = url[UrlParser.NETLOC]
        if UrlParser.NETLOC not in url_result:
            url_result[UrlParser.NETLOC] = url[UrlParser.HOSTNAME]
            if UrlParser.PORT in url and url[UrlParser.PORT]:
                # The port needs its ':' delimiter; plain concatenation would
                # turn 'example.com' and 8080 into 'example.com8080'.
                url_result[UrlParser.NETLOC] += ':' + str(url[UrlParser.PORT])
            if UrlParser.USERNAME in url and url[UrlParser.USERNAME]:
                credentials = '{}@'.format(url[UrlParser.USERNAME])
                if UrlParser.PASSWORD in url and url[UrlParser.PASSWORD]:
                    credentials = '{}:{}@'.format(url[UrlParser.USERNAME], url[UrlParser.PASSWORD])
                url_result[UrlParser.NETLOC] = credentials + url_result[UrlParser.NETLOC]

        url_result[UrlParser.PATH] = url[UrlParser.FILENAME]
        if UrlParser.PATH in url and url[UrlParser.PATH]:
            url_result[UrlParser.PATH] = url[UrlParser.PATH] + '/' + url_result[UrlParser.PATH]
            # The join can leave duplicate slashes at the seam; squeeze them.
            url_result[UrlParser.PATH] = re.sub('//+', '/', url_result[UrlParser.PATH])

        if UrlParser.QUERY in url and url[UrlParser.QUERY]:
            url_result[UrlParser.QUERY] = url[UrlParser.QUERY]

        result = SplitResult(**url_result)
        return result.geturl()
示例#4
0
def clean_link(url):
    """Return ``url`` without its fragment and without matched query parts.

    Returns None when the lower-cased scheme is not listed in
    ``ALLOWED_URL_SCHEMES``. Otherwise every portion of the query string
    matched by ``__utm_matcher`` (presumably ``utm_*`` tracking parameters --
    the pattern is defined elsewhere) is removed and the URL is re-assembled.
    """
    parts = urlsplit(url)
    if parts.scheme.lower() not in ALLOWED_URL_SCHEMES:
        return None

    query = parts.query
    # Keep substituting until the pattern stops matching, in case one removal
    # leaves behind text that matches again.
    while query and __utm_matcher.search(query):
        query = __utm_matcher.sub('', query)

    cleaned = SplitResult(parts.scheme, parts.netloc, parts.path, query, '')
    return cleaned.geturl()
示例#5
0
	def _parseurl(self, url):
		"""Split ``url``, remember its credentials and store it without them.

		Sets ``self.username`` and ``self.password`` from the user-info part
		of ``url`` (None when absent) and ``self.url`` to the URL rebuilt from
		the parsed hostname and, when one is given, the port.
		"""
		ret = urlsplit(url)
		self.username = ret.username
		self.password = ret.password

		host = ret.hostname
		# ``hostname`` strips the brackets of an IPv6 literal; restore them so
		# the address's own colons cannot be confused with the port separator.
		if host is not None and ":" in host:
			host = "[" + host + "]"

		# ``is not None`` replaces the Python-2-only ``<>`` operator.
		if ret.port is not None:
			netloc = host + ":" + str(ret.port)
		else:
			netloc = host
		n = SplitResult(ret.scheme, netloc, ret.path, ret.query, ret.fragment)
		self.url = n.geturl()
示例#6
0
def with_port(url_str):
    """Return ``url_str`` with ``settings.PORT`` appended to its host.

    The URL is only re-assembled, not modified, when ``settings.PORT`` is
    missing, falsy or equal to 80, when the URL has no network location, or
    when the port parsed from the URL is already truthy.
    """
    # A missing setting and the value 80 both mean "nothing to append".
    port = getattr(settings, 'PORT', None)
    if port == 80:
        port = None

    parts = urlsplit(url_str)
    if port and not parts.port and parts.netloc:
        scheme, netloc, path, query, fragment = parts
        parts = SplitResult(scheme, netloc + ":%s" % port, path, query, fragment)
    return parts.geturl()
示例#7
0
def with_port(url_str):
    """Append the configured ``settings.PORT`` to the host of ``url_str``.

    Nothing is appended when ``settings.PORT`` is absent, falsy or 80, when
    the URL lacks a network location, or when the URL's parsed port is
    already truthy; in those cases the URL is just split and joined again.
    """
    configured = getattr(settings, 'PORT', None)
    # Port 80 is treated the same as having no port configured.
    effective = None if configured == 80 else configured

    pieces = urlsplit(url_str)
    if not effective:
        return pieces.geturl()
    if pieces.port or not pieces.netloc:
        return pieces.geturl()

    extended = pieces.netloc + ":%s" % effective
    return pieces._replace(netloc=extended).geturl()
示例#8
0
def find_next_indexes(soup):
    '''
    Build the URL of every page of an album index or an album page.

    The page links are the anchors returned by
    ``soup.findAll('a', 'pix-navi-page')``. The highest numeric link text is
    taken as the page count, and the first link's href is used as a template
    whose ``p`` query parameter is set to 1 .. page count in turn.

    Returns a list of URL strings; the list is empty when there are no page
    links or none of them has numeric text.
    '''
    indexes = soup.findAll('a', 'pix-navi-page')
    # A link without plain text has ``tag.string`` set to None; skip it
    # instead of failing on ``None.isdigit()``.
    page_numbers = [int(tag.string) for tag in indexes
                    if tag.string and tag.string.isdigit()]
    if not page_numbers:
        # No numbered links: there is nothing to enumerate (and max() of an
        # empty list would raise).
        return []
    max_p = max(page_numbers)

    result = urlsplit(httplib.html_unescape(indexes[0]['href']))
    # Converting the query to ASCII is a quick workaround that avoids
    # patching urllib.unquote; see http://bugs.python.org/issue1712522
    query_dict = parse_qs(result.query.encode('ascii'))

    urls = []
    for p in range(1, max_p + 1):
        query_dict['p'] = p
        result = SplitResult(result.scheme, result.netloc, result.path,
                             urlencode(query_dict, doseq=True), result.fragment)
        urls.append(result.geturl())
    return urls
示例#9
0
    def __init__(self, url):
        """Normalize ``url`` and record where a MIB can be loaded from.

        A missing scheme defaults to "http" when the URL has a network
        location and to "file" otherwise. A "file" path that does not start
        with "/" is made absolute relative to the current directory.

        Sets ``_url``, ``_scheme``, ``_path`` and ``_filename`` (the last
        component of the path).

        Raises ValueError when ``url`` has no path component.
        """
        # urlsplit will parse what it can from the provided string.
        raw = urlsplit(url)
        if not raw.path:
            raise ValueError("Invalid argument for MIB source: %s" % url)

        if raw.scheme:
            scheme = raw.scheme
        elif raw.netloc:
            scheme = "http"
        else:
            scheme = "file"

        path = raw.path
        if scheme == "file" and not path.startswith("/"):
            path = os.path.abspath("./" + path)

        cooked = SplitResult(scheme, raw.netloc, path, raw.query, raw.fragment)
        self._scheme = scheme
        self._path = cooked.path
        self._filename = os.path.split(cooked.path)[-1]
        self._url = cooked.geturl()
示例#10
0
    def __init__(self, url):
        """Record a MIB source location derived from ``url``.

        When ``url`` has no scheme, "http" is assumed if it names a network
        location and "file" if it does not. For the "file" scheme a path not
        starting with "/" is resolved against the current directory.

        Stores the re-assembled URL in ``_url``, the scheme in ``_scheme``,
        the path in ``_path`` and the path's final component in
        ``_filename``.

        Raises ValueError if no path can be parsed out of ``url``.
        """
        # urlsplit will parse what it can from the provided string.
        raw = urlsplit(url)
        if not raw.path:
            raise ValueError("Invalid argument for MIB source: %s" % url)

        fallback_scheme = "http" if raw.netloc else "file"
        scheme = raw.scheme or fallback_scheme

        needs_resolving = scheme == "file" and not raw.path.startswith("/")
        path = os.path.abspath("./" + raw.path) if needs_resolving else raw.path

        cooked = raw._replace(scheme=scheme, path=path)
        self._url = cooked.geturl()
        self._scheme = scheme
        self._path = cooked.path
        self._filename = os.path.basename(cooked.path)