def validate_links(data):
    widgets = [Bar(), SimpleProgress()]
    pbar = ProgressBar(widgets=widgets, maxval=len(data)).start()
    for i, element in enumerate(data):
        url = element['url']
        if url == '':
            continue
        scheme = urlparse.urlsplit(url).scheme
        host = urlparse.urlsplit(url).netloc
        if scheme in ('http', 'https') and \
           url_status_cache.get(url) is not True:
            try:
                request = head(url, timeout=10)
                # some web sites cannot handle HEAD requests
                if request.status_code in (403, 405, 500) or \
                   host in ('mobil.morgenpost.de',):  # one-element tuple; a bare string would do a substring check
                    request = get(url)
            except Timeout as e:
                stderr.write('Connection to <%s> timed out.\n' % url)
                exit(1)
            except ConnectionError as e:
                stderr.write('Connection to <%s> failed.\n' % url)
                stderr.write(str(e) + '\n')
                exit(1)
            if request.ok:
                url_status_cache.set(url, request.ok)
            else:
                stderr.write('<%s> is unreachable.\n' % url)
                exit(1)
        pbar.update(i + 1)

def build_Url(url, href):
    # make exclusion list
    if re.search('logout', href) or re.search('action=out', href) or \
       re.search('action=logoff', href) or re.search('action=delete', href) or \
       re.search('UserLogout', href) or re.search('osCsid', href) or \
       re.search('file_manager.php', href) or href == "http://localhost":
        return ''
    parsed = urlparse.urlsplit(href)
    app = ''
    if parsed[1] == urlparse.urlsplit(url)[1]:
        app = href
    else:
        if len(parsed[1]) == 0 and (len(parsed[2]) != 0 or len(parsed[3]) != 0):
            domain = urlparse.urlsplit(url)[1]
            if re.match('/', parsed[2]):
                app = 'http://' + domain + parsed[2]
                if parsed[3] != '':
                    app += '?' + parsed[3]
            else:
                try:
                    app = 'http://' + domain + \
                        re.findall(r'(.*/)[^/]*', urlparse.urlsplit(url)[2])[0] + parsed[2]
                except IndexError:
                    app = 'http://' + domain + parsed[2]
                if parsed[3] != '':
                    app += '?' + parsed[3]
    return app

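# A minimal standalone sketch (assumed example URLs) of the resolution build_Url()
# does by hand: the stdlib urlparse.urljoin() resolves both root-relative and
# document-relative hrefs against the page URL.
import urlparse

page = 'http://shop.example.com/catalog/index.php?cat=1'
print urlparse.urljoin(page, '/basket.php?add=2')  # http://shop.example.com/basket.php?add=2
print urlparse.urljoin(page, 'product.php?id=7')   # http://shop.example.com/catalog/product.php?id=7
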
def _send_header(self, header_pieces, headers, body, is_request):
    if not self.headers_prepared:
        body_length = len(body)
        had_length = False
        had_host = False
        if is_request:
            resource = header_pieces[1]
            splitted = urlparse.urlsplit(resource)
            url = splitted.path
            if splitted.query:
                url += '?' + splitted.query
            header_line = '%s %s HTTP/%s\r\n' % (header_pieces[0], url, header_pieces[2])
        else:
            header_line = 'HTTP/%s %s %s\r\n' % header_pieces
        io_request = StringIO()
        io_request.write(header_line)
        for name, value in headers.iteritems():
            if name == 'content-length':
                io_request.write('%s: %s\r\n' % (name.title(), body_length))
                had_length = True
            else:
                io_request.write('%s: %s\r\n' % (name.title(), value))
            if name == 'host':
                had_host = True
        if not had_length and body_length > 0:
            io_request.write('%s: %s\r\n' % ('Content-Length', body_length))
        if not had_host and is_request:
            splitted = urlparse.urlsplit(resource)
            io_request.write('%s: %s\r\n' % ('Host', splitted.hostname))
        io_request.write('\r\n')
        self.buffer = io_request.getvalue()
        io_request.close()
        self.headers_prepared = True
        self.to_write = len(self.buffer)
        self.written = 0
    if not self.headers_sent:
        while self.to_write > 0:
            written = self.csock.send(self.buffer[self.written:])
            self.written += written
            self.to_write -= written
        self.headers_sent = True

def download_content_list(detail_url, headers, timeout):
    """
    sample url: http://weixin.sogou.com/gzhjs?openid=oIWsFt86NKeSGd_BQKp1GcDkYpv0&ext=D4y5Z3wUwj5uk6W7Yk9BqC3LAaFqirWHT5QFje14y0dip_leVhZF6qjo9Mm_UUVg&cb=sogou.weixin_gzhcb&page=1&gzhArtKeyWord=&tsn=0&t=1459425446419&_=1459425446169

    openid is fixed, ext is fixed, and cb=sogou.weixin_gzhcb is fixed as well.
    The only fields that change are t and _, which appear to be timestamps of
    when the page was opened.
    """
    global start_flag
    total_records = 0
    context_lst = []
    _t = start_flag
    now = int(time.time() * 1000)
    url_netloc = urlparse.urlsplit(detail_url)
    cur_url = 'http://%s/gzhjs?%s' % (url_netloc.netloc, url_netloc.query)
    params = "cb=sogou.weixin_gzhcb&page=%s&gzhArtKeyWord=&tsn=0&t=%s&_=%s"
    query_url = cur_url + '&' + params
    for i in range(1, 11):
        target_url = query_url % (i, now, _t)
        print target_url
        resp = download_page(target_url, headers, timeout=DEFAULT_TIMEOUT)
        strip_text = resp.text.replace('sogou.weixin_gzhcb(', '')
        strip_text = strip_text[:len(strip_text) - 1]
        context_lst.extend(json.loads(strip_text).get('items', []))
        if not total_records:
            total_records = json.loads(strip_text).get('totalItems', 0)
        _t = _t + 1
        time.sleep(2)
    return context_lst

def version_matcher(self, url):
    fname = os.path.basename(urlparse.urlsplit(url).path)
    version_match = re.search(r"([0-9]{2}\.[0-9]{0,2}\.[0-9]{0,2})", fname)
    if version_match is None:
        raise ProcessorError(
            "Something went wrong matching FMP update to full version.")
    else:
        return version_match.group(1)

def _do_request(self, request_id, parameters={}):
    """
    """
    if request_id is None:
        # Generate a new request identifier using the class' default generator
        request_id = self.idgenerator.id()
    req_params = dict(parameters)
    req_params.update(dict(
        partner=self.partner,
        vendor=self.vendor,
        user=self.username,
        pwd=self.password,
    ))
    parmlist = self._build_parmlist(req_params)
    headers = {
        'Host': urlparse.urlsplit(self.url_base)[1],
        'X-VPS-REQUEST-ID': str(request_id),
        'X-VPS-CLIENT-TIMEOUT': str(self.timeout),  # Doc says to do this
        'X-VPS-Timeout': str(self.timeout),         # Example says to do this
        'X-VPS-INTEGRATION-PRODUCT': self.CLIENT_IDENTIFIER,
        'X-VPS-INTEGRATION-VERSION': self.API_VERSION,
        'X-VPS-VIT-OS-NAME': sys.platform,
        'Connection': 'close',
        'Content-Type': 'text/namevalue',
    }
    self.log.debug(u'Request Headers: %s' % headers)
    try_count = 0
    results = None
    while (results is None and try_count < self.MAX_RETRY_COUNT):
        try:
            try_count += 1
            request = Request(
                url=self.url_base,
                data=parmlist.encode('utf-8'),
                headers=headers)
            response = urlopen(request)
            result_parmlist = response.read()
            response.close()
            self.log.debug(
                u'Result text: %s' % result_parmlist.decode('utf-8'))
            results = self._parse_parmlist(result_parmlist)
        except Exception, e:
            if try_count < self.MAX_RETRY_COUNT:
                self.log.warn(
                    u'API request attempt %s of %s failed - %%s' % (
                        try_count, self.MAX_RETRY_COUNT), e)
            else:
                self.log.exception(u'Final API request failed - %s', e)
                raise e

def victimise(victim, uri):
    raw_url = victim + uri
    scheme, netloc, path, raw_query, fragment = urlparse.urlsplit(raw_url)
    query = urlparse.parse_qs(raw_query)
    url = urlparse.urlunsplit((scheme, netloc, path, urlencode(query, True), fragment))
    print url
    http_client.fetch(url, fetch, use_gzip=False)

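# A minimal standalone sketch (assumed example URL, Python 2 stdlib only) of the
# query round-trip used above: parse_qs() turns the query string into a dict of
# value lists, and urlencode(..., doseq=True) re-encodes it.
import urlparse
from urllib import urlencode

parts = urlparse.urlsplit('http://example.com/search?q=a+b&tag=x&tag=y')
query = urlparse.parse_qs(parts.query)  # {'q': ['a b'], 'tag': ['x', 'y']}
print urlencode(query, True)            # e.g. 'q=a+b&tag=x&tag=y' (key order may vary)
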
def login_proceed(request):
    """View that handles the successful login.
    """
    template_name = '_user_login.html'
    # Check if the request came from logout page, if so set
    # authentication to redirect to home page
    referer_path = urlparse.urlsplit(request.META['HTTP_REFERER'])[2]
    if referer_path == reverse('auth_logout'):
        response = {
            'authentication': 'success',
            'redirect': reverse('home_page'),
        }
    elif referer_path == reverse('registration_activation_complete'):
        response = {
            'authentication': 'success',
            'redirect': reverse('view_profile'),
        }
    else:
        response = {
            'authentication': 'success',
            'markup': loader.render_to_string(template_name,
                                              RequestContext(request, {}))
        }
    json_response = json.dumps(response)
    return http.HttpResponse(json_response)

def download_metadata(target_directory):
    """
    Downloads XML files for DOIs on stdin into given directory.
    """
    stderr.write('Input DOIs, delimited by whitespace: ')
    dois = stdin.read().split()
    if len(dois) == 0:
        raise RuntimeError, 'No DOIs found.'

    stderr.write('Getting PubMed Central IDs for given DOIs … ')
    pmcids = _get_pmcids_from_dois(dois)
    if len(pmcids) == 0:
        raise RuntimeError, 'No PubMed Central IDs for given DOIs found.'
    stderr.write('found: %s\n' % ', '.join(pmcids))

    url = _get_query_url_from_pmcids(pmcids)
    yield {'url': url, 'completed': 0, 'total': 1}

    url_path = urlparse.urlsplit(url).path
    local_filename = path.join(target_directory,
                               url_path.split('/')[-1])
    with open(local_filename, 'wb') as local_file:
        content = _get_file_from_pmcids(pmcids)
        local_file.write(content.read())
    yield {'url': url, 'completed': 1, 'total': 1}

def validateURL(cls, full_url, video_item=True):
    """Make sure the url passed is in a valid form and return a video parser object"""
    if not isinstance(full_url, str):
        raise TypeError("Argument must be a string")
    spliturl = urlparse.urlsplit(full_url)
    hostname = spliturl.hostname
    # print len(cls.parsers.keys())
    if not hostname:
        return None
    elif hostname.startswith("www."):
        # strip the "www." prefix (lstrip would remove characters, not the prefix)
        hostname = hostname[len("www."):]
    if hostname not in cls.parsers:
        return None
    page_parser = cls.parsers[hostname].checkURL(full_url)
    if page_parser and video_item:
        youtube_video = VideoItem(page_parser)
    elif page_parser:
        youtube_video = page_parser
    else:
        youtube_video = None
    return youtube_video

def open_url(url, **kwargs):
    """
    open_url(url, **kwargs) - open url and return file descriptor

    url - local file path or full url path. Allowed protocols are
          local file path, file, http and ftp
    kwargs - additional attributes according to protocol, 'mode' for local
             path and file protocol, 'proxy', 'data' and 'timeout'
             (Python >= 2.6) for http and ftp protocols

    Examples:
        open_url('/home/praetorian/secret.txt')
        open_url('file:///home/praetorian/secret.txt', mode='r')
        open_url('http://domain.tld/secret.txt', proxy='172:16:1:100:8000')
        open_url('ftp://domain.tld/secret.txt')
    """
    bits = urlparse.urlsplit(url)
    attrs = kwargs

    if bits.scheme in ('', 'file'):
        url = bits.netloc + bits.path
        opener = open
    elif bits.scheme in ('http', 'ftp'):
        handlers = []
        if 'proxy' in attrs:
            handlers.append(ProxyHandler({bits.scheme: attrs.pop('proxy')}))
        url = bits.geturl()
        opener = build_opener(*handlers).open
    else:
        raise URLError("Unsupported protocol '%s'" % bits.scheme)

    return opener(url, **attrs)

def test_site(self, browser, url):
    '''
    Tests the given url using given browser object for login fields.
    Attempts to find real login url if not successful.

    Args:
        [browser] (Obj) The Browser object.
        [url]     (str) The url to test.
    '''
    login = False
    browser.visit(url)
    parsed_url = urlparse.urlsplit(url)
    if (parsed_url.path == "/" or not parsed_url.path)\
            and not parsed_url.query:
        # check to see if login elements present on current page
        # if not, proceed to find real login url
        login_url = self.get_login_url(browser)
        if login_url and login_url.rstrip("/") != browser.url.rstrip("/"):
            browser.visit(login_url)
            # URL was clicked on, !ret
        else:
            print("XXXX failed to find login link for: ", browser.url)
            return False
        self.browser_url = browser.url
        return True
    return False

def download_metadata(target_directory):
    """
    Downloads XML files for PMCIDs on stdin into given directory.
    """
    stderr.write('Input PMCIDs, delimited by whitespace: ')
    pmcids = stdin.read().split()
    if len(pmcids) == 0:
        raise RuntimeError, 'No PMCIDs found.'

    # delete files from earlier invocations
    listing = listdir(target_directory)
    for filename in listing:
        file_path = path.join(target_directory, filename)
        stderr.write("Removing “%s” … " % file_path)
        remove(file_path)
        stderr.write("done.\n")

    # chunk function by nosklo, source:
    # <http://stackoverflow.com/questions/434287/what-is-the-most-pythonic-way-to-iterate-over-a-list-in-chunks#answer-434328>
    def chunker(seq, size):
        return (seq[pos:pos + size] for pos in xrange(0, len(seq), size))

    for i, chunk in enumerate(chunker(pmcids, 365)):
        url = _get_query_url_from_pmcids(chunk)
        yield {'url': url, 'completed': 0, 'total': 1}

        url_path = urlparse.urlsplit(url).path
        local_filename = path.join(target_directory,
                                   url_path.split('/')[-1] + str(i))
        with open(local_filename, 'wb') as local_file:
            content = _get_file_from_pmcids(chunk)
            local_file.write(content.read())
        yield {'url': url, 'completed': 1, 'total': 1}

def pastebin(self, source, api_key=None):
    """
    Dump file/data to Pastebin

    `Required`
    :param str source:     data or filename

    `Optional`
    :param str api_key:    Pastebin api_dev_key

    Returns URL of pastebin document as a string
    """
    try:
        if api_key:
            info = {'api_option': 'paste',
                    'api_paste_code': normalize(source),
                    'api_dev_key': api_key}
            paste = globals()['post']('https://pastebin.com/api/api_post.php', data=info)
            parts = urlparse.urlsplit(paste)
            return urlparse.urlunsplit((parts.scheme, parts.netloc,
                                        '/raw' + parts.path, parts.query,
                                        parts.fragment)) if paste.startswith('http') else paste
        else:
            return "{} error: no pastebin API key".format(self.pastebin.func_name)
    except Exception as e:
        return '{} error: {}'.format(self.pastebin.func_name, str(e))

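# A minimal standalone sketch (assumed example paste ID) of the '/raw' rewrite above:
# split the returned paste URL and re-join it with '/raw' prepended to the path.
import urlparse

paste = 'https://pastebin.com/AbCd1234'
parts = urlparse.urlsplit(paste)
print urlparse.urlunsplit((parts.scheme, parts.netloc, '/raw' + parts.path,
                           parts.query, parts.fragment))
# -> https://pastebin.com/raw/AbCd1234
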
def submit(self, opener, res):
    """submit WAYF form with IDP

    :param opener: the urllib2 opener
    :param data: the form data as a dictionary
    :param res: the response object
    """
    log.info("Submitting form to wayf")
    # Set IDP to correct IDP
    wayf_data = {}
    idp = self.idp
    data = self.data
    idps = {}
    for d in data["user_idp"]:
        if isinstance(data["user_idp"][d], dict):
            idps.update(data["user_idp"][d])
    if not idp.get_idp() in idps:
        raise WAYFException("Can't find IdP '%s' in WAYF's IdP list" % idp)
    wayf_data["user_idp"] = idps[idp.get_idp()]
    wayf_data["Select"] = "Select"
    if data["form"]["action"].startswith("?"):
        urlsp = urlparse.urlsplit(res.url)
        urlsp = urlparse.urlunsplit((urlsp[0], urlsp[1], urlsp[2], "", ""))
        url = res.url + data["form"]["action"]
    else:
        url = urlparse.urljoin(res.url, data["form"]["action"])
    data = urllib.urlencode(wayf_data)
    request = Request(url, data)
    log.debug("POST: %s" % request.get_full_url())
    response = opener.open(request)
    return request, response

def set_language_ex(request):
    next = request.POST.get('next', request.GET.get('next'))
    if not is_safe_url(url=next, host=request.get_host()):
        next = request.META.get('HTTP_REFERER')
        if not is_safe_url(url=next, host=request.get_host()):
            next = '/'

    # remove lang from query
    scheme, netloc, path, query, fragment = urlparse.urlsplit(next)
    parsed_query = urlparse.parse_qsl(query)
    altered = False
    for k, v in parsed_query[:]:
        if LANG_GET_KEY == k:
            parsed_query.remove((k, v))
            altered = True
    if altered:
        query = urllib.urlencode(parsed_query)
        next = urlparse.urlunsplit((scheme, netloc, path, query, fragment))

    response = http.HttpResponseRedirect(next)
    if request.method == 'POST':
        lang_code = request.POST.get('language', None)
        if lang_code and check_for_language(lang_code):
            if hasattr(request, 'session'):
                request.session[LANGUAGE_SESSION_KEY] = lang_code
            else:
                response.set_cookie(settings.LANGUAGE_COOKIE_NAME, lang_code,
                                    max_age=settings.LANGUAGE_COOKIE_AGE,
                                    path=settings.LANGUAGE_COOKIE_PATH,
                                    domain=settings.LANGUAGE_COOKIE_DOMAIN)
    return response

def fetch(request):
    target = request.GET.get('url', None)
    if not target:
        response = HttpResponseBadRequest()
        return response
    to = request.GET.get('to', 'en')
    print('Translate %s to %s' % (target, to))
    page = ''
    if not target.startswith('http'):
        target = 'http://' + target
    try:
        page = _fetch_link(target)
    except Exception:
        return HttpResponseServerError('Fetch %s failed' % target)

    parts = list(urlparse.urlsplit(target))
    # clear path, query and fragment
    parts[2] = '/'
    parts[3] = ''
    parts[4] = ''
    base = urlparse.urlunsplit(parts)
    try:
        translated = _translate(page, to, 'zh-CHS', base)
    except Exception as e:
        return HttpResponseServerError('Translate failed: %s' % e)
    return HttpResponse(translated)

def testIndexRedirect(self):
    if settings.SET_URL_ROOT_HANDLER:
        response = self.client.get('/')
        self.assertEquals(response.status_code, 302)
        # Documentation says that we must get response.headers, but
        # instead we have HttpResponseRedirect object here
        self.assertEquals(urlparse.urlsplit(response['Location'])[2],
                          '/' + settings.BLOG_URLCONF_ROOT)

def make_requests_from_url(self, url):
    kw = self.macro.query(url)
    us = urlparse.urlsplit(url)
    qstr = dict(urlparse.parse_qsl(us.query))
    base = urlparse.urlunsplit(us._replace(query=''))
    meta = {'keyword': kw}
    return FormRequest(base, formdata=qstr, method=self.start_method,
                       headers=self.headers, cookies=self.cookies,
                       dont_filter=True, meta=meta)

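# A minimal standalone sketch (assumed example URL) of the split performed above:
# SplitResult is a namedtuple, so _replace(query='') drops the query string, while
# parse_qsl() keeps its parameters as key/value pairs for the form data.
import urlparse

us = urlparse.urlsplit('http://example.com/search?q=books&page=2')
print dict(urlparse.parse_qsl(us.query))          # {'q': 'books', 'page': '2'}
print urlparse.urlunsplit(us._replace(query=''))  # http://example.com/search
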
def generate_urls(obj, macro):
    try:
        if type(obj) == list:
            for url in obj:
                yield macro.expand(url)
        elif type(obj) == dict:
            base = macro.expand(obj['base'].encode('utf-8'))
            us = urlparse.urlsplit(base)
            qstr = dict(urlparse.parse_qsl(us.query))
            qstr.update(obj.get('qstr', {}))
            base = urlparse.urlunsplit(us._replace(query=''))
            for k, v in qstr.iteritems():
                if type(v) == dict and type(v['val']) == unicode:
                    # positional 'ignore': Python 2 encode() rejects keyword arguments
                    v = v['val'].encode(v.get('enc', 'utf-8'), 'ignore')
                qstr[k] = macro.expand(v)
            if 'keywords' in obj:
                kw_obj = obj['keywords']
                sub = kw_obj.get('sub')
                if sub:
                    frm = sub.get('from')
                    to = sub.get('to')
                    sub = functools.partial(re.sub, frm, to)
                else:
                    sub = lambda x: x
                for kw in load_keywords(kw_obj):
                    if kw == MAGIC:
                        yield 'http://0.0.0.0'
                        continue
                    key = kw_obj['name'].encode('utf-8')
                    val = kw
                    col = kw_obj.get('col', 0)
                    sep = kw_obj.get('sep')
                    if col > 0:
                        val = val.split(sep)[col - 1]
                    val = sub(val)
                    if kw_obj.get('query', True):
                        qstr.update({key: val})
                        url = base + '?' + urlencode(qstr)
                    else:
                        val = val.encode(kw_obj.get('enc', 'utf-8'), 'ignore') if type(val) == unicode else str(val)
                        url = base.replace(key, val) + '?' + urlencode(qstr)
                    macro.update({'sep': sep})
                    macro.bind(url, kw)
                    yield url
            else:
                url = base + '?' + urlencode(qstr)
                yield url
    except Exception as ex:
        log.msg(u'cannot generate urls: {}'.format(ex), level=log.ERROR)
        raise CloseSpider()

def urlStringToServers(urlString):
    " convert |-sep list of urls to list of hostnames "
    servers = set()
    urls = urlString.split("|")
    for url in urls:
        parts = urlparse.urlsplit(url)
        server = parts[1]
        server = server.replace("www.", "").strip()
        if server != "" and "pubmedcentral" not in server:
            servers.add(server)
    return servers

def delete_remote_file():
    remote_filename = os.path.basename(urlparse.urlsplit(remote_url)[2])
    remote_file = os.path.join(cfg_remotepath, remote_filename)
    ssh_retval = subprocess.call(["ssh",
                                  "-o", "PasswordAuthentication=no",
                                  "-o", "StrictHostKeyChecking=no",
                                  remotehost,
                                  "-f", "rm", "-f", remote_file],
                                 stdout=open(os.devnull),
                                 stderr=open(os.devnull))
    if 0 != ssh_retval:
        print "Failed to delete remote file"

def getapodlist(url, picpath):
    feed = feedparser.parse(url)
    for item in feed["items"]:
        pic = item["description"]
        parseurl = urlparse.urlsplit(pic)
        # the filename comes from the path; SplitResult has no 'parse' attribute
        outfile = parseurl.path.split("/")[3]
        picfile = os.path.join(picpath, outfile)
        if not os.path.isfile(picfile):
            urlretrieve(pic, picfile)

def _stager(options, **kwargs):
    util.display("\n[>]", color='green', style='bright', end=' ')
    util.display("Stager", color='reset', style='bright')
    assert 'url' in kwargs, "missing keyword argument 'url'"
    assert 'key' in kwargs, "missing keyword argument 'key'"
    assert 'var' in kwargs, "missing keyword argument 'var'"
    if options.encrypt:
        stager = open('core/stagers.py', 'r').read() + generators.main('run', url=kwargs['url'], key=kwargs['key'])
    else:
        stager = open('core/stagers.py', 'r').read() + generators.main('run', url=kwargs['url'])
    if not os.path.isdir('modules/stagers'):
        try:
            os.mkdir('modules/stagers')
        except OSError:
            util.log("Permission denied: unable to make directory './modules/stagers/'")
    if options.compress:
        util.display("\tCompressing stager... ", color='reset', style='normal', end=' ')
        __load__ = threading.Event()
        __spin__ = _spinner(__load__)
        output = generators.compress(stager)
        __load__.set()
        _update(stager, output, task='Compression')
        stager = output
    util.display("\tUploading stager... ", color='reset', style='normal', end=' ')
    __load__ = threading.Event()
    __spin__ = _spinner(__load__)
    if options.pastebin:
        assert options.pastebin, "missing argument 'pastebin' required for option 'pastebin'"
        url = util.pastebin(stager, options.pastebin)
    else:
        dirs = ['modules/stagers', 'byob/modules/stagers', 'byob/byob/modules/stagers']
        dirname = '.'
        for d in dirs:
            if os.path.isdir(d):
                dirname = d
        path = os.path.join(os.path.abspath(dirname), kwargs['var'] + '.py')
        with open(path, 'w') as fp:
            fp.write(stager)
        s = 'http://{}:{}/{}'.format(options.host, int(options.port) + 1,
                                     pathname2url(path.replace(os.path.join(os.getcwd(), 'modules'), '')))
        s = urlparse.urlsplit(s)
        url = urlparse.urlunsplit((s.scheme, s.netloc, os.path.normpath(s.path),
                                   s.query, s.fragment)).replace('\\', '/')
    __load__.set()
    util.display("(hosting stager at: {})".format(url), color='reset', style='dim')
    return url

def getVideoID(videoURL):
    urlSplit = urlparse.urlsplit(videoURL)
    toParse = str(urlSplit[2])
    try:
        videoID = toParse.split("/")[2]
    except Exception:
        print("[Heavy-R Downloader] Error! Could not extract the video ID!")
        print("[Heavy-R Downloader] Check that the video URL is not incomplete and try again.")
        sys.exit(1)
    return videoID

def parse_recipe(cls, url):
    maker_dict = {'www.manjulaskitchen.com': ManjulasMaker,
                  'www.101cookbooks.com': OneCookMaker,
                  'www.gourmet.com': GourmetMaker}
    target_maker = urlparse.urlsplit(url)[1]
    current_maker = maker_dict[target_maker]
    # create child and call child's process_url method
    current_recipe = current_maker(url).process_url()
    # passes back to the caller what the child class passes back
    return current_recipe

def _get_resources(self, target=None, base_url=None):
    if sys.version_info[0] < 3:
        from urllib import urlretrieve
        from urllib2 import urlopen, urlparse
        import StringIO
    else:
        from urllib import parse as urlparse
        from urllib.request import urlopen, urlretrieve
        from io import StringIO
    try:
        if not isinstance(target, list):
            raise TypeError("keyword argument 'target' must be type 'list'")
        if not isinstance(base_url, str):
            raise TypeError("keyword argument 'base_url' must be type 'str'")
        if not base_url.startswith('http'):
            raise ValueError("keyword argument 'base_url' must start with http:// or https://")
        log('[*] Searching %s' % base_url)
        path = urlparse.urlsplit(base_url).path
        base = path.strip('/').replace('/', '.')
        names = []
        for line in urlopen(base_url).read().splitlines():
            line = str(line)
            if 'href' in line and '</a>' in line and '__init__.py' not in line:
                names.append(line.rpartition('</a>')[0].rpartition('>')[2].strip('/'))
        for n in names:
            name, ext = os.path.splitext(n)
            if ext in ('.py', '.pyc'):
                module = '.'.join((base, name)) if base else name
                if module not in target:
                    log("[+] Adding %s" % module)
                    target.append(module)
            elif not len(ext):
                t = threading.Thread(target=self._get_resources,
                                     kwargs={'target': target,
                                             'base_url': '/'.join((base_url, n))})
                t.daemon = True
                t.start()
            else:
                resource = '/'.join((path, n))
                if resource not in target:
                    target.append(resource)
    except Exception as e:
        log("{} error: {}".format(self._get_resources.__name__, str(e)))

def run(self):
    for oneline in self.reader:
        thumb_hash, URL, priority = oneline.split() + [0]
        priority = int(priority)
        one_file = BackupFile()
        one_file.thumb_hash = thumb_hash
        one_file.name = os.path.basename(urlparse.urlsplit(URL).path)
        EXT = URL[-3:].upper()  # FLASHAIR seems to only allow three-letter extensions
        if self.ORM.exists(one_file):
            print "file already exists!"
            continue
        # Nothing has been downloaded yet.
        # The thumb_hash must have been unique.
        tmp_filename = os.path.join(self.tmp_dir, thumb_hash + "." + EXT)
        retval = sp.call("curl '%s' > '%s'" % (URL, tmp_filename), shell=True)
        if retval != 0:
            continue  # there was a problem. maybe try again later.
        md5sum = None
        try:
            # TODO: md5 of only the first few MB of large files.
            # (Or the last few, to guard against truncated files.)
            md5out = sp.check_output("md5sum %s" % tmp_filename, shell=True)
            md5sum, other = md5out.split()[:2]
            md5sum = md5sum.strip()
        except:
            pass
        one_file.full_hash = md5sum
        if self.ORM.exists(one_file):
            try:
                os.remove(tmp_filename)
            except:
                # maybe it's already gone?
                pass
            continue
        # Does not exist in the database
        # TODO: Set the creation date of the file (read that on input?)
        # TODO: Folders based on creation date
        # TODO:
        new_name = os.path.join(self.base_dir, one_file.name)
        shutil.move(tmp_filename, new_name)
        self.ORM.store(one_file)

def download_modules(self):
    mkdir_p(self.name)
    primary_urlobj = urlparse.urlsplit(primary_url)
    full_category_url = primary_url + self.url
    cat_soup = bs(urlopen(full_category_url))
    for row in cat_soup.find('table').find_all('tr'):
        try:
            mod = ModuleEntry(self, row, '%s://%s' % (primary_urlobj.scheme,
                                                      primary_urlobj.netloc))
            mod.start()
            self.logger.debug('category.mod: %s', mod)
            self.mods.append(mod)
        except FlowException:
            pass

def replace_type_chunk(self, offset):
    o_loc = Locations.objects.order_by('id')[offset:offset + self.step_limit]
    loc_type = dict(APP_CONTENTS_LOC_TYPE)
    for item in o_loc:
        pattern = urlparse.urlsplit(item.url_view).netloc
        for i in loc_type.keys():
            if i in pattern.replace('.', ''):
                item.type = i
                item.save()
                break

def validator(form, value):
    parts = urlparse.urlsplit(value)
    if parts.scheme not in ['http', 'https']:
        raise ValidationError(lazy_gettext(u'URLs must be of type '
                                           u'http or https.'))
    elif parts.fragment:
        raise ValidationError(lazy_gettext(u'URLs may not have a '
                                           u'#reference part.'))
    elif parts.netloc.find('@') != -1:
        raise ValidationError(lazy_gettext(u'URLs should not be specified '
                                           u'with username and password.'))
    elif parts.query:
        raise ValidationError(lazy_gettext(u'URLs may not have a ?query.'))

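# A minimal standalone sketch (assumed example URL) of the SplitResult fields the
# validator above inspects: scheme, netloc, query and fragment.
import urlparse

parts = urlparse.urlsplit('https://user:pw@example.com/page?x=1#top')
print parts.scheme    # 'https'
print parts.netloc    # 'user:pw@example.com'  -> contains '@', so credentials are present
print parts.query     # 'x=1'
print parts.fragment  # 'top'
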
def asciify_url(url, force_quote=False):
    r"""Attempts to make a unicode url usable with ``urllib/urllib2``.

    More specifically, it attempts to convert the unicode object ``url``,
    which is meant to represent an IRI, to an unicode object that,
    containing only ASCII characters, is a valid URI. This involves:

        * IDNA/Puny-encoding the domain name.
        * UTF8-quoting the path and querystring parts.

    See also RFC 3987.
    """
    assert type(url) == unicode
    parts = urlparse.urlsplit(url)
    if not parts.scheme or not parts.netloc:
        # apparently not a url
        return url

    # idna-encode domain
    hostname = parts.hostname.encode('idna')

    # UTF8-quote the other parts. We check each part individually if
    # it needs to be quoted - that should catch some additional user
    # errors, say for example an umlaut in the username even though
    # the path *is* already quoted.
    def quote(s, safe):
        s = s or ''
        # Triggers on non-ascii characters - another option would be:
        #     urllib.quote(s.replace('%', '')) != s.replace('%', '')
        # which would trigger on all %-characters, e.g. "&".
        if s.encode('ascii', 'replace') != s or force_quote:
            return urllib.quote(s.encode('utf8'), safe=safe)
        return s

    username = quote(parts.username, '')
    password = quote(parts.password, safe='')
    path = quote(parts.path, safe='/')
    query = quote(parts.query, safe='&=')

    # put everything back together
    netloc = hostname
    if username or password:
        netloc = '@' + netloc
        if password:
            netloc = ':' + password + netloc
        netloc = username + netloc
    if parts.port:
        netloc += ':' + str(parts.port)
    return urlparse.urlunsplit([parts.scheme, netloc, path, query, parts.fragment])

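# A minimal standalone sketch (assumed example strings, Python 2) of the two
# transformations asciify_url() performs: IDNA-encoding the host and
# percent-encoding the UTF-8 bytes of the path.
import urllib

print u'bücher.example'.encode('idna')                 # 'xn--bcher-kva.example'
print urllib.quote(u'/päth'.encode('utf8'), safe='/')  # '/p%C3%A4th'
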
def victimise(victim, request):
    try:
        lines = request.split('\n')
        uri = lines[0].split(' ')[1]
        body = lines[-1]
        raw_url = victim + uri
        scheme, netloc, path, raw_query, fragment = urlparse.urlsplit(raw_url)
        query = urlparse.parse_qs(raw_query)
        url = urlparse.urlunsplit((scheme, netloc, path, urlencode(query, True), fragment))
        if body:
            http_client.fetch(url, fetch, method="POST", body=body, use_gzip=False)
    except:
        pass

def getBoardAndThread(threadURL):
    if threadURL is None:
        threadURL = raw_input("[Check'em] Insert the thread's URL: ")
        print("")
    url_split = urlparse.urlsplit(threadURL)
    if str(url_split[1]) != "boards.4chan.org" and str(url_split[1]) != "4chan.org":
        print("[Check'em] Very funny retard, wrong website!")
        sys.exit(1)
    else:
        toParse = str(url_split[2])
        board = toParse.split("/")[1]
        threadID = toParse.split("/")[3]
        return "https://a.4cdn.org/%s/thread/%s.json" % (board, threadID)

def main(self):
    try:
        url = ""
        defined_version = self.env.get("version")
        update = self.getLatestFilemakerProAdvancedInstaller(defined_version)
        version_str = self.env.get("major_version")
        update["version"] = self.version_matcher(update["url"])
        url = update["url"]
        self.output("URL found '%s'" % url, verbose_level=2)

        self.env["version"] = update["version"]
        self.env["url"] = url
        self.env["package_name"] = update["name"]
        self.env["package_file"] = os.path.basename(urlparse.urlsplit(url).path)
    except Exception as err:
        # handle unexpected errors here
        raise ProcessorError(err)

def from_native(self, value):
    if type(value) is dict:
        encoding = value.get('encoding', 'base64')
        filename = value.get('filename', '')
        content = value['content']
        if encoding == 'base64':
            content = base64.b64decode(content)
        elif encoding == 'url':
            try:
                res = requests.get(content, stream=True)
            except (MissingSchema, InvalidSchema, InvalidURL), e:
                raise ValidationError(smart_unicode(e))
            if status.is_success(res.status_code):
                if not filename:
                    filename = basename(urlparse.urlsplit(content)[2])
                content = res.content
        value = ContentFile(content, name=filename)

def main():
    main_directory()
    print("=====================================")
    print("Glot.io Code Downloader - Version %s" % (version))
    print("=====================================\n")
    url = raw_input("Insert the Snippet's URL: ")
    url_split = urlparse.urlsplit(url)
    if str(url_split[1]) != "glot.io":
        print("\n[Glot.io Code Downloader] Error! Wrong website! Check the URL and try again!")
        sys.exit(1)
    else:
        title, title_dir, filenames = code_information(url)
        code_directory(title_dir)
        get_codes(url, filenames)

def test_create_lead(self):
    self.client.login(username=TEST_USERNAME, password=TEST_PASSWORD)
    lead = create_lead()
    self.failUnlessEqual(lead.staffing.count(), 0)
    self.failUnlessEqual(lead.staffing_list(), ", (JCF)")
    lead.staffing.add(Consultant.objects.get(pk=1))
    self.failUnlessEqual(lead.staffing.count(), 1)
    self.failUnlessEqual(len(lead.update_date_strf()), 14)
    self.failUnlessEqual(lead.staffing_list(), "SRE, (JCF)")
    self.failUnlessEqual(lead.short_description(), "A wonderfull lead th...")
    self.failUnlessEqual(urlresolvers.reverse("leads.views.detail", args=[4]),
                         PREFIX + "/leads/4/")

    url = "".join(urlparse.urlsplit(urlresolvers.reverse("leads.views.detail", args=[4]))[2:])
    response = self.client.get(url)
    self.failUnlessEqual(response.status_code, 200)
    context = response.context[-1]
    self.failUnlessEqual(unicode(context["lead"]), u"World company : DSI - laala")
    self.failUnlessEqual(unicode(context["user"]), "sre")

def _get_resources(self, target=None, base_url=None):
    try:
        if not isinstance(target, list):
            raise TypeError("keyword argument 'target' must be type '{}'".format(list))
        if not isinstance(base_url, str):
            raise TypeError("keyword argument 'base_url' must be type '{}'".format(str))
        if not base_url.startswith('http'):
            raise ValueError("keyword argument 'base_url' must start with http:// or https://")
        log('[*] Searching %s' % base_url)
        path = urlparse.urlsplit(base_url).path
        base = path.strip('/').replace('/', '.')
        names = [line.rpartition('</a>')[0].rpartition('>')[2].strip('/')
                 for line in urlopen(base_url).read().splitlines()
                 if 'href' in line if '</a>' in line if '__init__.py' not in line]
        for n in names:
            name, ext = os.path.splitext(n)
            if ext in ('.py', '.pyc'):
                module = '.'.join((base, name)) if base else name
                if module not in target:
                    log("[+] Adding %s" % module)
                    target.append(module)
            elif not len(ext):
                t = threading.Thread(target=self._get_resources,
                                     kwargs={'target': target,
                                             'base_url': '/'.join((base_url, n))})
                t.daemon = True
                t.start()
            else:
                resource = '/'.join((path, n))
                if resource not in target:
                    target.append(resource)
    except Exception as e:
        log("{} error: {}".format(self._get_resources.func_name, str(e)))

def selected(self):
    """See zope.app.publisher.interfaces.browser.IBrowserMenuItem"""
    # --=mpj17=-- Not perfect, but it will work for now.
    normalized_action = self.action
    if self.action.startswith('@@'):
        normalized_action = self.action[2:]
    normalized_action = normalized_action.strip('/')

    rurl = self.request.getURL()
    scheme, netloc, path, query, frag = urlparse.urlsplit(rurl)
    if path.endswith('@@index.html'):
        path = path[:-12]
    path = path.strip('/')

    retval = (((normalized_action == '') and (path == '')) or
              ((normalized_action != '') and
               path.startswith(normalized_action)))
    assert type(retval) == bool
    return retval