Example #1
def validate_links(data):
    widgets = [Bar(), SimpleProgress()]
    pbar = ProgressBar(widgets=widgets, maxval=len(data)).start()
    for i, element in enumerate(data):
        url = element['url']
        if url == '':
            continue
        scheme = urlparse.urlsplit(url).scheme
        host = urlparse.urlsplit(url).netloc
        if scheme in ('http', 'https') and \
            url_status_cache.get(url) is not True:
            try:
                request = head(url, timeout=10)
                # some web sites cannot handle HEAD requests
                if request.status_code in (403, 405, 500) or \
                    host in ('mobil.morgenpost.de',):
                    request = get(url)
            except Timeout as e:
                stderr.write('Connection to <%s> timed out.\n' % url)
                exit(1)
            except ConnectionError as e:
                stderr.write('Connection to <%s> failed.\n' % url)
                stderr.write(str(e) + '\n')
                exit(1)
            if request.ok:
                url_status_cache.set(url, request.ok)
            else:
                stderr.write('<%s> is unreachable.\n' % url)
                exit(1)
        pbar.update(i + 1)
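A minimal, isolated sketch of the HEAD-then-GET fallback used above, assuming the `requests` package and a placeholder URL:

from requests import head, get

response = head('http://example.com/', timeout=10)
# fall back to GET for servers that reject or mishandle HEAD
if response.status_code in (403, 405, 500):
    response = get('http://example.com/', timeout=10)
print response.ok, response.status_code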
Example #2
def build_Url(url, href):
    # exclusion list: links that must not be followed
    excluded = ('logout', 'action=out', 'action=logoff', 'action=delete',
                'UserLogout', 'osCsid', 'file_manager.php')
    if any(re.search(pattern, href) for pattern in excluded) \
            or href == "http://localhost":
        return ''

    parsed = urlparse.urlsplit(href)
    app = ''

    if parsed[1] == urlparse.urlsplit(url)[1]:
        # absolute link on the same host
        app = href
    elif len(parsed[1]) == 0 and (len(parsed[2]) != 0 or len(parsed[3]) != 0):
        # relative link: resolve against the base url's host (and path)
        domain = urlparse.urlsplit(url)[1]
        if parsed[2].startswith('/'):
            app = 'http://' + domain + parsed[2]
        else:
            try:
                app = 'http://' + domain + \
                    re.findall(r'(.*/)[^/]*', urlparse.urlsplit(url)[2])[0] + parsed[2]
            except IndexError:
                app = 'http://' + domain + parsed[2]
        if parsed[3] != '':
            app += '?' + parsed[3]

    return app
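For comparison, the standard library resolves relative links too; a minimal sketch with assumed inputs:

import urlparse

base = 'http://example.com/shop/index.php'
print build_Url(base, 'product.php?id=1')         # http://example.com/shop/product.php?id=1
print urlparse.urljoin(base, 'product.php?id=1')  # same result for this case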
Example #4
    def _send_header(self, header_pieces, headers, body, is_request):

        if not self.headers_prepared:
            body_length = len(body)
            had_length = False
            had_host = False
            if is_request:
                resource = header_pieces[1]
                splitted = urlparse.urlsplit(resource)
                url = splitted.path
                if splitted.query:
                    url += '?' + splitted.query
                header_line = '%s %s HTTP/%s\r\n' % (header_pieces[0], url, header_pieces[2])
            else:
                header_line = 'HTTP/%s %s %s\r\n' % header_pieces

            io_request = StringIO()
            io_request.write(header_line)
            for name, value in headers.iteritems():
                if name == 'content-length':
                    io_request.write('%s: %s\r\n' % (name.title(), body_length))
                    had_length = True
                else:
                    io_request.write('%s: %s\r\n' % (name.title(), value))
                if name == 'host':
                    had_host = True

            if not had_length and body_length > 0:
                io_request.write('%s: %s\r\n' % ('Content-Length', body_length))

            if not had_host and is_request:
                splitted = urlparse.urlsplit(resource)
                io_request.write('%s: %s\r\n' % ('Host', splitted.hostname))

            io_request.write('\r\n')
            self.buffer = io_request.getvalue()
            io_request.close()
            self.headers_prepared = True
            self.to_write = len(self.buffer)
            self.written = 0

        if not self.headers_sent:
            while self.to_write > 0:
                written = self.csock.send(self.buffer[self.written:])
                self.written += written
                self.to_write -= written

            self.headers_sent = True
Example #5
File: crawler.py  Project: seraphln/wheel
def download_content_list(detail_url, headers, timeout):
    """
    sample url: http://weixin.sogou.com/gzhjs?openid=oIWsFt86NKeSGd_BQKp1GcDkYpv0&ext=D4y5Z3wUwj5uk6W7Yk9BqC3LAaFqirWHT5QFje14y0dip_leVhZF6qjo9Mm_UUVg&cb=sogou.weixin_gzhcb&page=1&gzhArtKeyWord=&tsn=0&t=1459425446419&_=1459425446169

    openid is fixed,
    ext is fixed as well,
    and cb=sogou.weixin_gzhcb is also fixed.
    The only fields that vary are t and _, which look like the timestamp at which the page was opened.
    """
    global start_flag
    total_records = 0
    context_lst = []
    _t = start_flag 
    now = int(time.time() * 1000)
    url_netloc = urlparse.urlsplit(detail_url)
    cur_url = 'http://%s/gzhjs?%s' % (url_netloc.netloc, url_netloc.query)
    params = "cb=sogou.weixin_gzhcb&page=%s&gzhArtKeyWord=&tsn=0&t=%s&_=%s"
    query_url = cur_url + '&' + params

    for i in range(1, 11):
        target_url = query_url % (i, now, _t)
        print target_url
        resp = download_page(target_url, headers, timeout=timeout)
        # strip the JSONP wrapper: sogou.weixin_gzhcb(...)
        strip_text = resp.text.replace('sogou.weixin_gzhcb(', '')
        strip_text = strip_text[:-1]
        context_lst.extend(json.loads(strip_text).get('items', []))
        if not total_records:
            total_records = json.loads(strip_text).get('totalItems', 0)
        _t = _t + 1
        time.sleep(2)

    return context_lst
Example #6
    def version_matcher(self, url):
        fname = os.path.basename(urlparse.urlsplit(url).path)
        version_match = re.search(r"([0-9]{2}\.[0-9]{0,2}\.[0-9]{0,2})", fname)
        if version_match is None:
            raise ProcessorError(
                "Something went wrong matching FMP update to full version.")
        else:
            return version_match.group(1)
Example #7
    def _do_request(self, request_id, parameters={}):
        """
        """
        if request_id is None:
            # Generate a new request identifier using the class' default generator
            request_id = self.idgenerator.id()
        
        req_params = dict(parameters)
        req_params.update(dict(
            partner = self.partner,
            vendor = self.vendor,
            user = self.username,
            pwd = self.password,            
        ))
        
        parmlist = self._build_parmlist(req_params)
        
        headers = {
            'Host': urlparse.urlsplit(self.url_base)[1],
            'X-VPS-REQUEST-ID': str(request_id),
            'X-VPS-CLIENT-TIMEOUT': str(self.timeout), # Doc says to do this
            'X-VPS-Timeout': str(self.timeout), # Example says to do this
            'X-VPS-INTEGRATION-PRODUCT': self.CLIENT_IDENTIFIER,
            'X-VPS-INTEGRATION-VERSION': self.API_VERSION,
            'X-VPS-VIT-OS-NAME': sys.platform,
            'Connection': 'close',
            'Content-Type': 'text/namevalue',            
            }

        self.log.debug(u'Request Headers: %s' % headers)
            
        try_count = 0
        results = None
        while (results is None and try_count < self.MAX_RETRY_COUNT):
            try:
                try_count += 1
                request = Request(
                    url = self.url_base, 
                    data = parmlist.encode('utf-8'), 
                    headers = headers)
                    
                response = urlopen(request)
                result_parmlist = response.read()
                response.close()
                
                self.log.debug(
                    u'Result text: %s' % result_parmlist.decode('utf-8')
                )
                
                results = self._parse_parmlist(result_parmlist)
            except Exception as e:
                
                if try_count < self.MAX_RETRY_COUNT:
                    self.log.warn(
                        u'API request attempt %s of %s failed - %%s' % (
                            try_count, self.MAX_RETRY_COUNT), e
                        )
                else:
                    self.log.exception(u'Final API request failed - %s', e)
                    raise e
Example #8
def victimise(victim, uri):
    raw_url = victim + uri
    scheme, netloc, path, raw_query, fragment = urlparse.urlsplit(raw_url)
    query = urlparse.parse_qs(raw_query)
    url = urlparse.urlunsplit((scheme, netloc, path, urlencode(query, True), fragment))
    print url
    http_client.fetch(url, fetch, use_gzip=False)
Example #9
def login_proceed(request):
    """View that handles the successful login.
    """

    template_name = '_user_login.html'

    # Check if the request came from logout page, if so set
    # authentication to redirect to home page
    referer_path = urlparse.urlsplit(request.META['HTTP_REFERER'])[2]
    if referer_path == reverse('auth_logout'):
        response = {
            'authentication': 'success',
            'redirect': reverse('home_page'),
        }
    elif referer_path == reverse('registration_activation_complete'):
        response = {
            'authentication': 'success',
            'redirect': reverse('view_profile'),
        }
    else:
        response = {
            'authentication': 'success',
            'markup': loader.render_to_string(template_name,
                                              RequestContext(request, {}))
        }

    json_response = json.dumps(response)
    return http.HttpResponse(json_response)
Example #10
def download_metadata(target_directory):
    """
    Downloads XML files for DOIs on stdin into given directory.
    """
    stderr.write('Input DOIs, delimited by whitespace: ')
    dois = stdin.read().split()
    if len(dois) == 0:
        raise RuntimeError('No DOIs found.')

    stderr.write('Getting PubMed Central IDs for given DOIs … ')
    pmcids = _get_pmcids_from_dois(dois)
    if len(pmcids) == 0:
        raise RuntimeError('No PubMed Central IDs for given DOIs found.')
    stderr.write('found: %s\n' % ', '.join(pmcids))

    url = _get_query_url_from_pmcids(pmcids)
    yield { 'url': url, 'completed': 0, 'total': 1 }

    url_path = urlparse.urlsplit(url).path
    local_filename = path.join(target_directory,
                               url_path.split('/')[-1])
    with open(local_filename, 'wb') as local_file:
        content = _get_file_from_pmcids(pmcids)
        local_file.write(content.read())
        yield { 'url': url, 'completed': 1, 'total': 1 }
Example #11
    def validateURL (cls, full_url, video_item=True):
        """Make sure the url passed is in a valid form and return a video parser object"""
        if not isinstance(full_url, str):
            raise TypeError("Argument must be a string")

        spliturl = urlparse.urlsplit(full_url)
        hostname = spliturl.hostname

        if not hostname:
            return None
        elif hostname.startswith("www."):
            # str.lstrip() strips a character set, not a prefix, so slice instead
            hostname = hostname[len("www."):]

        if hostname not in cls.parsers:
            return None

        page_parser = cls.parsers[hostname].checkURL(full_url)
        if page_parser and video_item:
            youtube_video = VideoItem (page_parser)
        elif page_parser:
            youtube_video = page_parser
        else:
            youtube_video = None

        return youtube_video
Example #12
def open_url(url, **kwargs):
    """
    open_url(url, **kwargs) - open url and return file descriptor

    url - local file path or full url path. Allowed protocols are local file
    path, file, http and ftp

    kwargs - additional attributes according to protocol, 'mode' for local
    path and file protocol, 'proxy', 'data' and 'timeout' (Python >= 2.6)
    for http and ftp protocols

    Examples:

    open_url('/home/praetorian/secret.txt')
    open_url('file:///home/praetorian/secret.txt', mode='r')
    open_url('http://domain.tld/secret.txt', proxy='172.16.1.100:8000')
    open_url('ftp://domain.tld/secret.txt')
    """
    bits = urlparse.urlsplit(url)
    attrs = kwargs

    if bits.scheme in ('', 'file'):
        url = bits.netloc + bits.path
        opener = open
    elif bits.scheme in ('http', 'ftp'):
        handlers = []
        if 'proxy' in attrs:
            handlers.append(ProxyHandler({bits.scheme: attrs.pop('proxy')}))

        url = bits.geturl()
        opener = build_opener(*handlers).open
    else:
        raise URLError("Unsupported protocol '%s'" % bits.scheme)

    return opener(url, **attrs)
Example #13
    def test_site(self, browser, url):
        '''
        Tests the given url using given browser object for login fields. Attempts to find real login url if not successful.

        Args:
                [browser] (Obj)     The Browser object.
                [url] (str)         The url to test.
        '''
        login = False
        browser.visit(url)

        parsed_url = urlparse.urlsplit(url)
        if (parsed_url.path == "/" or not parsed_url.path)\
           and not parsed_url.query:
            # check to see if login elements present on current page
            # if not, proceed to find real login url
            login_url = self.get_login_url(browser)
            if login_url and login_url.rstrip("/") != browser.url.rstrip("/"):
                # a distinct login URL was found; follow it
                browser.visit(login_url)
            else:
                print("XXXX   failed to find login link for: ", browser.url)
                return False
        self.browser_url = browser.url
        return True
Example #15
def download_metadata(target_directory):
    """
    Downloads XML files for PMCIDs on stdin into given directory.
    """
    stderr.write('Input PMCIDs, delimited by whitespace: ')
    pmcids = stdin.read().split()
    if len(pmcids) == 0:
        raise RuntimeError('No PMCIDs found.')

    # delete files from earlier invocations
    listing = listdir(target_directory)
    for filename in listing:
        file_path = path.join(target_directory, filename)
        stderr.write("Removing “%s” … " % file_path)
        remove(file_path)
        stderr.write("done.\n")

    # chunk function by nosklo, source:
    # <http://stackoverflow.com/questions/434287/what-is-the-most-pythonic-way-to-iterate-over-a-list-in-chunks#answer-434328>
    def chunker(seq, size):
        return (seq[pos:pos + size] for pos in xrange(0, len(seq), size))

    for i, chunk in enumerate(chunker(pmcids, 365)):
        url = _get_query_url_from_pmcids(chunk)
        yield { 'url': url, 'completed': 0, 'total': 1 }

        url_path = urlparse.urlsplit(url).path
        local_filename = path.join(target_directory,
                                   url_path.split('/')[-1] + str(i))
        with open(local_filename, 'wb') as local_file:
            content = _get_file_from_pmcids(chunk)
            local_file.write(content.read())
            yield { 'url': url, 'completed': 1, 'total': 1 }
Example #16
    def pastebin(self, source, api_key=None):
        """
        Dump file/data to Pastebin

        `Required`
        :param str source:      data or filename

        `Optional`
        :param str api_key:     Pastebin api_dev_key

        Returns URL of pastebin document as a string

        """
        try:
            if api_key:
                info = {
                    'api_option': 'paste',
                    'api_paste_code': normalize(source),
                    'api_dev_key': api_key
                }
                paste = globals()['post'](
                    'https://pastebin.com/api/api_post.php', data=info)
                parts = urlparse.urlsplit(paste)
                return urlparse.urlunsplit(
                    (parts.scheme, parts.netloc, '/raw' + parts.path,
                     parts.query,
                     parts.fragment)) if paste.startswith('http') else paste
            else:
                return "{} error: no pastebin API key".format(
                    self.pastebin.func_name)
        except Exception as e:
            return '{} error: {}'.format(self.pastebin.func_name, str(e))
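The /raw rewrite above can be checked in isolation; a minimal sketch with an assumed paste URL:

import urlparse

parts = urlparse.urlsplit('https://pastebin.com/AbCd1234')
print urlparse.urlunsplit((parts.scheme, parts.netloc, '/raw' + parts.path,
                           parts.query, parts.fragment))
# -> https://pastebin.com/raw/AbCd1234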
Example #17
    def submit(self, opener, res):
        """submit WAYF form with IDP

        :param opener: the urllib2 opener
        :param res: the response object (the form data comes from self.data)

        """
        log.info("Submitting form to wayf")
        # Set IDP to correct IDP
        wayf_data = {}
        idp = self.idp
        data = self.data
        idps = {}
        for d in data["user_idp"]:
            if isinstance(data["user_idp"][d], dict):
                idps.update(data["user_idp"][d])
        if not idp.get_idp() in idps:
            raise WAYFException("Can't find IdP '%s' in WAYF's IdP list" % idp)
        wayf_data["user_idp"] = idps[idp.get_idp()]
        wayf_data["Select"] = "Select"
        if data["form"]["action"].startswith("?"):
            urlsp = urlparse.urlsplit(res.url)
            urlsp = urlparse.urlunsplit((urlsp[0], urlsp[1], urlsp[2], "", ""))
            url = res.url + data["form"]["action"]
        else:
            url = urlparse.urljoin(res.url, data["form"]["action"])
        data = urllib.urlencode(wayf_data)
        request = Request(url, data)
        log.debug("POST: %s" % request.get_full_url())
        response = opener.open(request)
        return request, response
Example #19
def set_language_ex(request):
    next = request.POST.get('next', request.GET.get('next'))
    if not is_safe_url(url=next, host=request.get_host()):
        next = request.META.get('HTTP_REFERER')
        if not is_safe_url(url=next, host=request.get_host()):
            next = '/'

    # remove lang from query
    scheme, netloc, path, query, fragment = urlparse.urlsplit(next)
    parsed_query = urlparse.parse_qsl(query)
    altered = False
    for k, v in parsed_query[:]:
        if LANG_GET_KEY == k:
            parsed_query.remove((k, v))
            altered = True
    if altered:
        query = urllib.urlencode(parsed_query)
        next = urlparse.urlunsplit((scheme, netloc, path, query, fragment))

    response = http.HttpResponseRedirect(next)
    if request.method == 'POST':
        lang_code = request.POST.get('language', None)
        if lang_code and check_for_language(lang_code):
            if hasattr(request, 'session'):
                request.session[LANGUAGE_SESSION_KEY] = lang_code
            else:
                response.set_cookie(settings.LANGUAGE_COOKIE_NAME, lang_code,
                                    max_age=settings.LANGUAGE_COOKIE_AGE,
                                    path=settings.LANGUAGE_COOKIE_PATH,
                                    domain=settings.LANGUAGE_COOKIE_DOMAIN)
    return response
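The query-key removal idiom above, isolated; a minimal sketch with assumed values ('lang' stands in for LANG_GET_KEY):

import urlparse
import urllib

next_url = '/page?lang=de&x=1'
scheme, netloc, path, query, fragment = urlparse.urlsplit(next_url)
pairs = [kv for kv in urlparse.parse_qsl(query) if kv[0] != 'lang']
print urlparse.urlunsplit((scheme, netloc, path, urllib.urlencode(pairs), fragment))
# -> /page?x=1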
Example #20
def fetch(request):
    target = request.GET.get('url', None)
    if not target:
        response = HttpResponseBadRequest()
        return response
    to = request.GET.get('to', 'en')
    print('Translate %s to %s' % (target, to))
    page = ''
    if not target.startswith('http'):
        target = 'http://' + target
    try:
        page = _fetch_link(target)
    except Exception:
        return HttpResponseServerError('Fetch %s failed' % target)
    parts = list(urlparse.urlsplit(target))
    # reset the path to root and drop the query and fragment
    parts[2] = '/'
    parts[3] = ''
    parts[4] = ''
    base = urlparse.urlunsplit(parts)
    try:
        translated = _translate(page, to, 'zh-CHS', base)
    except Exception as e:
        return HttpResponseServerError('Translate failed: %s' % e)
    return HttpResponse(translated)
Example #22
File: tests.py  Project: ixth/blombum
    def testIndexRedirect(self):
        if settings.SET_URL_ROOT_HANDLER:
            response = self.client.get('/')
            self.assertEquals(response.status_code, 302)
            # Documentation says that we must get response.headers, but
            # instead we have an HttpResponseRedirect object here
            self.assertEquals(urlparse.urlsplit(response['Location'])[2],
                              '/' + settings.BLOG_URLCONF_ROOT)
Example #23
    def make_requests_from_url(self, url):

        kw = self.macro.query(url)
        us = urlparse.urlsplit(url)
        qstr = dict(urlparse.parse_qsl(us.query))
        base = urlparse.urlunsplit(us._replace(query=''))
        meta = {'keyword': kw}
        return FormRequest(base, formdata=qstr, method=self.start_method,
                           headers=self.headers, cookies=self.cookies,
                           dont_filter=True, meta=meta)
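The split-and-rebuild idiom above, shown in isolation with an assumed URL:

import urlparse

us = urlparse.urlsplit('http://example.com/search?q=python&page=2')
print urlparse.urlunsplit(us._replace(query=''))  # http://example.com/search
print dict(urlparse.parse_qsl(us.query))          # {'q': 'python', 'page': '2'} (key order may vary)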
Example #25
File: utils.py  Project: BlankRain/webbot
def generate_urls(obj, macro):
    try:
        if type(obj) == list:
            for url in obj:
                yield macro.expand(url)

        elif type(obj) == dict:
            base = macro.expand(obj['base'].encode('utf-8'))
            us = urlparse.urlsplit(base)
            qstr = dict(urlparse.parse_qsl(us.query))
            qstr.update(obj.get('qstr', {}))
            base = urlparse.urlunsplit(us._replace(query=''))

            for k, v in qstr.iteritems():
                if type(v) == dict and type(v['val']) == unicode:
                    v = v['val'].encode(v.get('enc', 'utf-8'), errors='ignore')
                qstr[k] = macro.expand(v)

            if 'keywords' in obj:
                kw_obj = obj['keywords']

                sub = kw_obj.get('sub')
                if sub:
                    frm = sub.get('from')
                    to = sub.get('to')
                    sub = functools.partial(re.sub, frm, to)
                else:
                    sub = lambda x: x

                for kw in load_keywords(kw_obj):

                    if kw == MAGIC:
                        yield 'http://0.0.0.0'
                        continue

                    key = kw_obj['name'].encode('utf-8')
                    val = kw
                    col = kw_obj.get('col', 0)
                    sep = kw_obj.get('sep')
                    if col > 0:
                        val = val.split(sep)[col - 1]
                    val = sub(val)
                    if kw_obj.get('query', True):
                        qstr.update({key: val})
                        url = base + '?' + urlencode(qstr)
                    else:
                        if type(val) == unicode:
                            val = val.encode(kw_obj.get('enc', 'utf-8'), errors='ignore')
                        else:
                            val = str(val)
                        url = base.replace(key, val) + '?' + urlencode(qstr)
                    macro.update({'sep': sep})
                    macro.bind(url, kw)
                    yield url
            else:
                url = base + '?' + urlencode(qstr)
                yield url

    except Exception as ex:
        log.msg(u'cannot generate urls: {}'.format(ex), level=log.ERROR)
        raise CloseSpider()
Example #26
def urlStringToServers(urlString):
    """Convert a |-separated list of URLs to a set of hostnames."""
    servers = set()
    urls = urlString.split("|")
    for url in urls:
        parts = urlparse.urlsplit(url)
        server = parts[1].strip()
        # strip only a leading "www.", not every occurrence in the name
        if server.startswith("www."):
            server = server[len("www."):]
        if server != "" and "pubmedcentral" not in server:
            servers.add(server)
    return servers
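A quick spot-check with assumed input:

print urlStringToServers('http://www.example.org/a|https://cdn.example.net/b')
# -> set(['example.org', 'cdn.example.net']) (element order may vary)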
Example #28
        def delete_remote_file():
            remote_filename = os.path.basename(urlparse.urlsplit(remote_url)[2])
            remote_file = os.path.join(cfg_remotepath, remote_filename)

            ssh_retval = subprocess.call(["ssh", "-o", "PasswordAuthentication=no",
                                          "-o", "StrictHostKeyChecking=no",
                                          remotehost, "-f", "rm", "-f", remote_file],
                                         stdout=open(os.devnull, 'w'),
                                         stderr=open(os.devnull, 'w'))
            if ssh_retval != 0:
                print "Failed to delete remote file"
Example #29
def getapodlist(url, picpath):
    feed = feedparser.parse(url)
    for item in feed["items"]:
        pic = item["description"]
        parseurl = urlparse.urlsplit(pic)
        # pick the expected path segment for the output filename
        outfile = parseurl.path.split("/")[3]
        picfile = os.path.join(picpath, outfile)
        if not os.path.isfile(picfile):
            urlretrieve(pic, picfile)
Example #30
def _stager(options, **kwargs):
    util.display("\n[>]", color='green', style='bright', end=' ')
    util.display("Stager", color='reset', style='bright')

    assert 'url' in kwargs, "missing keyword argument 'url'"
    assert 'key' in kwargs, "missing keyword argument 'key'"
    assert 'var' in kwargs, "missing keyword argument 'var'"

    if options.encrypt:
        stager = open('core/stagers.py', 'r').read() + generators.main('run', url=kwargs['url'], key=kwargs['key'])
    else:
        stager = open('core/stagers.py', 'r').read() + generators.main('run', url=kwargs['url'])

    if not os.path.isdir('modules/stagers'):
        try:
            os.mkdir('modules/stagers')
        except OSError:
            util.log("Permission denied: unable to make directory './modules/stagers/'")

    if options.compress:
        util.display("\tCompressing stager... ", color='reset', style='normal', end=' ')
        __load__ = threading.Event()
        __spin__ = _spinner(__load__)
        output = generators.compress(stager)
        __load__.set()
        _update(stager, output, task='Compression')
        stager = output

    util.display("\tUploading stager... ", color='reset', style='normal', end=' ')
    __load__ = threading.Event()
    __spin__ = _spinner(__load__)

    if options.pastebin:
        assert options.pastebin, "missing argument 'pastebin' required for option 'pastebin'"
        url = util.pastebin(stager, options.pastebin)
    else:
        dirs = ['modules/stagers','byob/modules/stagers','byob/byob/modules/stagers']
        dirname = '.'
        for d in dirs:
            if os.path.isdir(d):
                dirname = d

        path = os.path.join(os.path.abspath(dirname), kwargs['var'] + '.py' )

        with open(path, 'w') as fp:
            fp.write(stager)

        s = 'http://{}:{}/{}'.format(options.host, int(options.port) + 1, pathname2url(path.replace(os.path.join(os.getcwd(), 'modules'), '')))
        s = urlparse.urlsplit(s)
        url = urlparse.urlunsplit((s.scheme, s.netloc, os.path.normpath(s.path), s.query, s.fragment)).replace('\\','/')

    __load__.set()
    util.display("(hosting stager at: {})".format(url), color='reset', style='dim')
    return url
Example #31
def getVideoID(videoURL):
    urlSplit = urlparse.urlsplit(videoURL)
    toParse = str(urlSplit[2])

    try:
        videoID = toParse.split("/")[2]
    except Exception:
        print("[Heavy-R Downloader] Error! Could not extract the video ID!")
        print("[Heavy-R Downloader] Check that the video URL is not incomplete and try again.")
        sys.exit(1)

    return videoID
Example #32
    def parse_recipe(cls, url):
        maker_dict = {'www.manjulaskitchen.com': ManjulasMaker,
                      'www.101cookbooks.com': OneCookMaker,
                      'www.gourmet.com': GourmetMaker}
        target_maker = urlparse.urlsplit(url)[1]
        current_maker = maker_dict[target_maker]

        # create child and call child's process_url method
        current_recipe = current_maker(url).process_url()

        # passes back to the caller what the child class passes back
        return current_recipe
Example #33
    def _get_resources(self, target=None, base_url=None):
        if sys.version_info[0] < 3:
            from urllib import urlretrieve
            from urllib2 import urlopen, urlparse
            import StringIO
        else:
            from urllib import parse as urlparse
            from urllib.request import urlopen, urlretrieve
            from io import StringIO
        try:
            if not isinstance(target, list):
                raise TypeError(
                    "keyword argument 'target' must be type 'list'")
            if not isinstance(base_url, str):
                raise TypeError(
                    "keyword argument 'base_url' must be type 'str'")
            if not base_url.startswith('http'):
                raise ValueError(
                    "keyword argument 'base_url' must start with http:// or https://")
            log('[*] Searching %s' % base_url)
            path = urlparse.urlsplit(base_url).path
            base = path.strip('/').replace('/', '.')
            names = []
            for line in urlopen(base_url).read().splitlines():
                line = str(line)
                if 'href' in line and '</a>' in line and '__init__.py' not in line:
                    names.append(
                        line.rpartition('</a>')[0].rpartition('>')[2].strip('/'))
            for n in names:
                name, ext = os.path.splitext(n)
                if ext in ('.py', '.pyc'):
                    module = '.'.join((base, name)) if base else name
                    if module not in target:
                        log("[+] Adding %s" % module)
                        target.append(module)
                elif not len(ext):
                    t = threading.Thread(target=self._get_resources,
                                         kwargs={'target': target,
                                                 'base_url': '/'.join((base_url, n))})
                    t.daemon = True
                    t.start()
                else:
                    resource = '/'.join((path, n))
                    if resource not in target:
                        target.append(resource)
        except Exception as e:
            log("{} error: {}".format(self._get_resources.__name__, str(e)))
Example #34
    def run(self):
        for oneline in self.reader:
            # default priority to 0 when the line has only two fields
            fields = oneline.split() + [0]
            thumb_hash, URL, priority = fields[:3]
            priority = int(priority)

            one_file = BackupFile()
            one_file.thumb_hash = thumb_hash
            one_file.name = os.path.basename(urlparse.urlsplit(URL).path)
            # FLASHAIR seems to only allow three-letter extensions
            EXT = URL[-3:].upper()

            if self.ORM.exists(one_file):
                print "file already exists!"
                continue  # Nothing has been downloaded yet.

            # The thumb_hash must have been unique.

            tmp_filename = os.path.join(self.tmp_dir, thumb_hash + "." + EXT)

            retval = sp.call("curl '%s' > '%s'" % (URL, tmp_filename), shell=True)
            if retval != 0:
                continue  # there was a problem. maybe try again later.

            md5sum = None
            try:
                # TODO: md5 of only the first few MB of large files. (Or the
                # last few, to guard against truncated files.)
                md5out = sp.check_output("md5sum %s" % tmp_filename, shell=True)
                md5sum, other = md5out.split()[:2]
                md5sum = md5sum.strip()
            except Exception:
                pass

            one_file.full_hash = md5sum

            if self.ORM.exists(one_file):
                try:
                    os.remove(tmp_filename)
                except OSError:
                    # maybe it's already gone?
                    pass
                continue

            # Does not exist in the database
            # TODO: Set the creation date of the file (read that on input?)
            # TODO: Folders based on creation date

            new_name = os.path.join(self.base_dir, one_file.name)
            shutil.move(tmp_filename, new_name)

            self.ORM.store(one_file)
Example #35
    def download_modules(self):
        mkdir_p(self.name)
        primary_urlobj = urlparse.urlsplit(primary_url)
        full_category_url = primary_url + self.url
        cat_soup = bs(urlopen(full_category_url))
        for row in cat_soup.find('table').find_all('tr'):
            try:
                mod = ModuleEntry(self, row, '%s://%s' % (primary_urlobj.scheme,
                                                          primary_urlobj.netloc))
                mod.start()
                self.logger.debug('category.mod: %s', mod)
                self.mods.append(mod)
            except FlowException:
                pass
Example #36
    def replace_type_chunk(self, offset):
        o_loc = Locations.objects.order_by('id')[offset:offset + self.step_limit]
        loc_type = dict(APP_CONTENTS_LOC_TYPE)

        for item in o_loc:
            pattern = urlparse.urlsplit(item.url_view).netloc

            for i in loc_type.keys():
                if i in pattern.replace('.', ''):
                    item.type = i
                    item.save()

                    break
Example #37
    def validator(form, value):
        parts = urlparse.urlsplit(value)
        if parts.scheme not in ['http', 'https']:
            raise ValidationError(lazy_gettext(u'URLs must be of type '
                                               u'http or https.'))
        elif parts.fragment:
            raise ValidationError(lazy_gettext(u'URLs may not have a '
                                               u'#reference part.'))
        elif parts.netloc.find('@') != -1:
            raise ValidationError(lazy_gettext(u'URLs should not be specified '
                                               u'with username and password.'))
        elif parts.query:
            raise ValidationError(lazy_gettext(u'URLs may not have a ?query.'))
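A hypothetical spot-check of the rules above; each commented call raises ValidationError for the reason noted:

validator(None, u'https://example.com/page')        # passes
# validator(None, u'ftp://example.com/file')        # scheme is not http/https
# validator(None, u'https://example.com/page#top')  # has a #reference part
# validator(None, u'https://user:pw@example.com/')  # embeds credentials
# validator(None, u'https://example.com/?q=1')      # has a ?query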
Example #38
def asciify_url(url, force_quote=False):
    r"""Attempts to make a unicode url usuable with ``urllib/urllib2``.

    More specifically, it attempts to convert the unicode object ``url``,
    which is meant to represent a IRI, to an unicode object that,
    containing only ASCII characters, is a valid URI. This involves:

        * IDNA/Puny-encoding the domain name.
        * UTF8-quoting the path and querystring parts.

    See also RFC 3987.
    """
    assert type(url) == unicode

    parts = urlparse.urlsplit(url)
    if not parts.scheme or not parts.netloc:
        # apparently not a URL
        return url

    # idna-encode domain
    hostname = parts.hostname.encode('idna')

    # UTF8-quote the other parts. We check each part individually if
    # it needs to be quoted - that should catch some additional user
    # errors, say for example an umlaut in the username even though
    # the path *is* already quoted.
    def quote(s, safe):
        s = s or ''
        # Triggers on non-ascii characters - another option would be:
        #     urllib.quote(s.replace('%', '')) != s.replace('%', '')
        # which would trigger on all %-characters, e.g. "&".
        if s.encode('ascii', 'replace') != s or force_quote:
            return urllib.quote(s.encode('utf8'), safe=safe)
        return s

    username = quote(parts.username, '')
    password = quote(parts.password, safe='')
    path = quote(parts.path, safe='/')
    query = quote(parts.query, safe='&=')

    # put everything back together
    netloc = hostname
    if username or password:
        netloc = '@' + netloc
        if password:
            netloc = ':' + password + netloc
        netloc = username + netloc
    if parts.port:
        netloc += ':' + str(parts.port)
    return urlparse.urlunsplit(
        [parts.scheme, netloc, path, query, parts.fragment])
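A minimal sketch of the expected behavior, assuming Python 2 and a made-up IRI:

print asciify_url(u'http://müller.example/pfad?q=süß')
# -> http://xn--mller-kva.example/pfad?q=s%C3%BC%C3%9F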
Example #40
def victimise(victim, request):
    try:
        lines = request.split('\n')
        uri = lines[0].split(' ')[1]
        body = lines[-1]

        raw_url = victim + uri
        scheme, netloc, path, raw_query, fragment = urlparse.urlsplit(raw_url)
        query = urlparse.parse_qs(raw_query)
        url = urlparse.urlunsplit((scheme, netloc, path,
                                   urlencode(query, True), fragment))
        if body:
            http_client.fetch(url, fetch, method="POST", body=body, use_gzip=False)
    except Exception:
        # skip requests that cannot be parsed
        pass
Example #43
def getBoardAndThread(threadURL):
    if threadURL is None:
        threadURL = raw_input("[Check'em] Insert the thread's URL: ")
        print("")

    url_split = urlparse.urlsplit(threadURL)

    if str(url_split[1]) != "boards.4chan.org" and str(url_split[1]) != "4chan.org":
        print("[Check'em] Wrong website!")
        sys.exit(1)
    else:
        toParse = str(url_split[2])
        board = toParse.split("/")[1]
        threadID = toParse.split("/")[3]

        return "https://a.4cdn.org/%s/thread/%s.json" % (board, threadID)
Example #45
    def from_native(self, value):
        if type(value) is dict:
            encoding = value.get('encoding', 'base64')
            filename = value.get('filename', '')
            content = value['content']
            if encoding == 'base64':
                content = base64.b64decode(content)
            elif encoding == 'url':
                try:
                    res = requests.get(content, stream=True)
                except (MissingSchema, InvalidSchema, InvalidURL) as e:
                    raise ValidationError(smart_unicode(e))
                if status.is_success(res.status_code):
                    if not filename:
                        filename = basename(urlparse.urlsplit(content)[2])
                    content = res.content
            value = ContentFile(content, name=filename)
        # hand the (possibly wrapped) value back to the serializer
        return value
Example #47
    def main(self):
        try:
            url = ""
            defined_version = self.env.get("version")
            update = self.getLatestFilemakerProAdvancedInstaller(
                defined_version)
            version_str = self.env.get("major_version")
            update["version"] = self.version_matcher(update["url"])
            url = update["url"]
            self.output("URL found '%s'" % url, verbose_level=2)
            self.env["version"] = update["version"]
            self.env["url"] = url
            self.env["package_name"] = update["name"]
            self.env["package_file"] = os.path.basename(
                urlparse.urlsplit(url).path)
        except Exception as err:
            # handle unexpected errors here
            raise ProcessorError(err)
Example #48
File: tests.py  Project: agateau/pydici
    def test_create_lead(self):
        self.client.login(username=TEST_USERNAME, password=TEST_PASSWORD)
        lead = create_lead()
        self.failUnlessEqual(lead.staffing.count(), 0)
        self.failUnlessEqual(lead.staffing_list(), ", (JCF)")
        lead.staffing.add(Consultant.objects.get(pk=1))
        self.failUnlessEqual(lead.staffing.count(), 1)
        self.failUnlessEqual(len(lead.update_date_strf()), 14)
        self.failUnlessEqual(lead.staffing_list(), "SRE, (JCF)")
        self.failUnlessEqual(lead.short_description(), "A wonderfull lead th...")
        self.failUnlessEqual(urlresolvers.reverse("leads.views.detail", args=[4]), PREFIX + "/leads/4/")

        url = "".join(urlparse.urlsplit(urlresolvers.reverse("leads.views.detail", args=[4]))[2:])
        response = self.client.get(url)
        self.failUnlessEqual(response.status_code, 200)
        context = response.context[-1]
        self.failUnlessEqual(unicode(context["lead"]), u"World company : DSI  - laala")
        self.failUnlessEqual(unicode(context["user"]), "sre")
Example #51
def main():
    main_directory()

    print("=====================================")
    print("Glot.io Code Downloader - Version %s" % (version))
    print("=====================================\n")

    url = raw_input("Insert the Snippet's URL: ")

    url_split = urlparse.urlsplit(url)
    if str(url_split[1]) != "glot.io":
        print(
            "\n[Glot.io Code Downloader] Error! Wrong website! Check the URL and try again!"
        )
        sys.exit(1)
    else:
        title, title_dir, filenames = code_information(url)
        code_directory(title_dir)
        get_codes(url, filenames)
Example #52
    def selected(self):
        """See zope.app.publisher.interfaces.browser.IBrowserMenuItem"""
        # --=mpj17=-- Not perfect, but it will work for now.
        normalized_action = self.action
        if self.action.startswith('@@'):
            normalized_action = self.action[2:]
        normalized_action = normalized_action.strip('/')

        rurl = self.request.getURL()
        scheme, netloc, path, query, frag = urlparse.urlsplit(rurl)
        if path.endswith('@@index.html'):
            path = path[:-12]
        path = path.strip('/')

        retval = (((normalized_action == '') and (path == ''))
                  or ((normalized_action != '') and
                      path.startswith(normalized_action)))
        assert type(retval) == bool
        return retval