def consolidationThread():
	global mydata
	global remoteHosts
	print "Starting consolidationThread"
	while True:
		for host in remoteHosts:
			try:

				host = "//"+host # must be cleaner way to do this
				remote_address = (urlparse(host).hostname,urlparse(host).port)
				#print "Trying: ",remote_address
				sock = socket.socket(socket.AF_INET,socket.SOCK_STREAM)
				sock.settimeout(5) 	# could be longer if not on local network

				sock.connect(remote_address)

				sock.sendall("{\"numberOfRecords\":1,\"version\":1}\n")

				data = sock.recv(1024)

				decoded_data = json.loads(data)
				if int(decoded_data['RelativeTime']) < ((int(time.time()) * 1000) - int(mydata['CaptureDateTime'])):

					#print "Received NEWER: ",data
					print "NEWEST FROM: ",remote_address
					mydata = decoded_data

				sock.close()

			except Exception, e:
				print e,remote_address



		time.sleep(10)
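
A note on the "//" + host prefix above: urlparse only populates hostname and port when the string carries a netloc, so a bare host:port pair parses to nothing useful. A minimal sketch (the host value is hypothetical):

from urlparse import urlparse  # Python 2, as in the snippet above

print(urlparse("example.com:9090").hostname)    # None -- no netloc without a scheme or "//"
print(urlparse("//example.com:9090").hostname)  # 'example.com'
print(urlparse("//example.com:9090").port)      # 9090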
Example #2
def vk(url):
    try:
        try: oid, id = urlparse.parse_qs(urlparse.urlparse(url).query)['oid'][0] , urlparse.parse_qs(urlparse.urlparse(url).query)['id'][0]
        except: oid, id = re.compile('\/video(.*)_(.*)').findall(url)[0]
        try: hash = urlparse.parse_qs(urlparse.urlparse(url).query)['hash'][0]
        except: hash = vk_hash(oid, id)

        u = 'http://api.vk.com/method/video.getEmbed?oid=%s&video_id=%s&embed_hash=%s' % (oid, id, hash)
 
        result = client.request(u)
        result = re.sub(r'[^\x00-\x7F]+',' ', result)

        try: result = json.loads(result)['response']
        except: result = vk_private(oid, id)

        url = []
        try: url += [{'quality': 'HD', 'url': result['url720']}]
        except: pass
        try: url += [{'quality': 'SD', 'url': result['url540']}]
        except: pass
        try: url += [{'quality': 'SD', 'url': result['url480']}]
        except: pass
        if not url == []: return url
        try: url += [{'quality': 'SD', 'url': result['url360']}]
        except: pass
        if not url == []: return url
        try: url += [{'quality': 'SD', 'url': result['url240']}]
        except: pass

        if not url == []: return url

    except:
        return
Example #3
    def _format_url(self, url):

        if not url:
            return self.root_url

        dir_sep = self.options['directory_separator']

        parsed_url = urlparse(url)

        if not parsed_url.netloc and not parsed_url.scheme:
            url = self.root_url + dir_sep + url.lstrip(dir_sep)
            parsed_url = urlparse(url)

        formatted_url = parsed_url.scheme + '://'

        domain = parsed_url.netloc
        for subdomain in self.options['redundant_subdomains']:
            domain = re.sub(r'^' + subdomain + r'\.', '', domain, flags=re.IGNORECASE)

        formatted_url += domain

        path = parsed_url.path.rstrip(dir_sep)

        formatted_url += path

        return formatted_url
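
A standalone sketch of the same normalisation (hypothetical root_url and options, not the class above): relative paths are anchored to the root URL, a redundant "www." subdomain is stripped, and the trailing separator is dropped.

import re
from urlparse import urlparse  # Python 2

def format_url(url, root_url="http://www.example.com", dir_sep="/"):
    if not url:
        return root_url
    parsed = urlparse(url)
    if not parsed.netloc and not parsed.scheme:
        parsed = urlparse(root_url + dir_sep + url.lstrip(dir_sep))
    domain = re.sub(r'^www\.', '', parsed.netloc, flags=re.IGNORECASE)
    return parsed.scheme + '://' + domain + parsed.path.rstrip(dir_sep)

print(format_url('about/team/'))                    # http://example.com/about/team
print(format_url('http://www.example.com/about/'))  # http://example.com/about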
Example #4
def sitemap_parse(sitemap_option, astring, google_results, website_url):
    not_indexed = []
    not_sitemap = []
    error = ''
    sitemap_results = []
    website_scheme = urlparse(website_url).scheme
    if website_scheme != '':
        website_url = website_scheme + "://" + urlparse(website_url).netloc
    if website_url[-1] != '/':
        website_url += '/'
    if astring != '':
        if sitemap_option == 'sitemap':

            resp = requests.get(astring)
            soup = Soup(resp.content)

        elif sitemap_option == 'upload_sitemap':

            soup = Soup(astring)
        urls = soup.findAll('url')
        for u in urls:
            loc = u.find('loc').string
            sitemap_results.append(loc)
            if loc not in google_results:
                not_indexed.append(loc)
        for loc in google_results:
            if loc not in sitemap_results:
                not_sitemap.append(loc)
    return not_indexed, not_sitemap, error
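
A hedged usage sketch of the parser above, using the 'upload_sitemap' branch; the XML, URLs, and the Soup/requests imports all follow the snippet's own assumptions:

sitemap_xml = """<urlset>
  <url><loc>http://example.com/</loc></url>
  <url><loc>http://example.com/contact/</loc></url>
</urlset>"""

not_indexed, not_sitemap, error = sitemap_parse(
    'upload_sitemap', sitemap_xml, ['http://example.com/'], 'http://example.com')
# not_indexed == ['http://example.com/contact/'], not_sitemap == [], error == ''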
Example #5
	def __call__(self, url, count_of_crawler):
		"""
		Function which fetch the content from the given URL and collect all the
		URL in the content and pass the first url of the page to fetch the
		content.
		"""
		try:
			page = urllib2.urlopen(url)
			soup = BeautifulSoup(page.read())	

			links_on_page = map(lambda anchor: anchor.get('href'), 
						soup.find_all('a'))

			cleaned_url = map(lambda link: link if urlparse(link).scheme 
	 				and urlparse(url).netloc else (urlparse(url)
					.scheme+"://"+urlparse(url).netloc+link if 
					link[0] == "/" else url+link), links_on_page)
			visited_url.append(url)
			total_collected_url.append(cleaned_url)
			next_url_to_visit = [next_url for next_url in cleaned_url\
				 if not next_url in visited_url and not "#" in next_url][0]
		
			if count_of_crawler and next_url_to_visit:	
				count_of_crawler = crawler(next_url_to_visit, 
								count_of_crawler-1)
	
		except:
			print "It seems there is some issue in URL "+url
	
		return count_of_crawler
  def local_(masteruri, org_masteruri, uri):
    '''
    Test whether the node runs on the same machine as the ROS master and whether ``masteruri`` and ``org_masteruri`` are equal.

    :param masteruri: The URI of the ROS master currently tested.
    :type masteruri: str
    :param org_masteruri: The URI of the ROS master where the node was originally registered.
    :type org_masteruri: str
    :param uri: The URI of the node.
    :type uri: str
    :rtype: bool
    '''
    result = False
    try:
      from urlparse import urlparse
      om = urlparse(masteruri)
      on = urlparse(uri)
      result = (om.hostname == on.hostname) and (masteruri == org_masteruri)
    except:
      pass
    return result
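
A hedged usage example (the URIs are hypothetical, and this assumes local_ can be called directly): the node counts as local only when its URI resolves to the same hostname as the tested master and that master is the one it was registered with.

print(local_('http://pc1:11311', 'http://pc1:11311', 'http://pc1:51234'))  # True
print(local_('http://pc1:11311', 'http://pc2:11311', 'http://pc1:51234'))  # False -- registered elsewhere
print(local_('http://pc1:11311', 'http://pc1:11311', 'http://pc2:51234'))  # False -- node on another host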
Example #7
def ConfigureHostnames(config):
  """This configures the hostnames stored in the config."""
  if flags.FLAGS.external_hostname:
    hostname = flags.FLAGS.external_hostname
  else:
    try:
      hostname = socket.gethostname()
    except (OSError, IOError):
      print "Sorry, we couldn't guess your hostname.\n"

    hostname = RetryQuestion("Please enter your hostname e.g. "
                             "grr.example.com", "^[\\.A-Za-z0-9-]+$", hostname)

  print """\n\n-=Server URL=-
The Server URL specifies the URL that the clients will connect to
communicate with the server. For best results this should be publicly
accessible. By default this will be port 8080 with the URL ending in /control.
"""
  frontend_url = RetryQuestion("Frontend URL", "^http://.*/$",
                               "http://%s:8080/" % hostname)
  config.Set("Client.server_urls", [frontend_url])

  frontend_port = urlparse.urlparse(frontend_url).port or config_lib.CONFIG.Get(
      "Frontend.bind_port")
  config.Set("Frontend.bind_port", frontend_port)

  print """\n\n-=AdminUI URL=-:
The UI URL specifies where the Administrative Web Interface can be found.
"""
  ui_url = RetryQuestion("AdminUI URL", "^http[s]*://.*$",
                         "http://%s:8000" % hostname)
  config.Set("AdminUI.url", ui_url)
  ui_port = urlparse.urlparse(ui_url).port or config_lib.CONFIG.Get(
      "AdminUI.port")
  config.Set("AdminUI.port", ui_port)
Example #8
    def test_enketo_remote_server_responses(self):
        # just in case we want to shift testing back to the main server
        testing_enketo_url = settings.ENKETO_URL
        #testing_enketo_url = 'http://enketo-dev.formhub.org'
        form_id = "test_%s" % re.sub(re.compile("\."), "_", str(time()))
        server_url = "%s/%s" % (self.base_url, self.user.username)
        enketo_url = '%slaunch/launchSurvey' % testing_enketo_url

        values = {
            'format': 'json',
            'form_id': form_id,
            'server_url': server_url
        }
        data = urllib.urlencode(values)
        req = urllib2.Request(enketo_url, data)
        try:
            response = urllib2.urlopen(req)
            response = json.loads(response.read())
            success = response['success']
            if not success and 'reason' in response:
                fail_msg = "This enketo installation is for use by "\
                    "formhub.org users only."
                if response['reason'].startswith(fail_msg):
                    raise SkipTest
            return_url = response['url']
            success = response['success']
            self.assertTrue(success)
            enketo_base_url = urlparse(settings.ENKETO_URL).netloc
            return_base_url = urlparse(return_url).netloc
            self.assertIn(enketo_base_url, return_base_url)
        except urllib2.URLError:
            self.assertTrue(False)

        #second time
        req2 = urllib2.Request(enketo_url, data)
        try:
            response2 = urllib2.urlopen(req2)
            response2 = json.loads(response2.read())
            return_url_2 = response2['url']
            success2 = response2['success']
            reason2 = response2['reason']
            self.assertEqual(return_url, return_url_2)
            self.assertFalse(success2)
            self.assertEqual(reason2, "existing")
        except urllib2.URLError:
            self.assertTrue(False)

        #error message
        values['server_url'] = ""
        data = urllib.urlencode(values)
        req3 = urllib2.Request(enketo_url, data)
        try:
            response3 = urllib2.urlopen(req3)
            response3 = json.loads(response3.read())
            success3 = response3['success']
            reason3 = response3['reason']
            self.assertFalse(success3)
            self.assertEqual(reason3, "empty")
        except urllib2.URLError:
            self.assertTrue(False)
    def checkRedir(self, orig_path):
        # old_url = portal_url+item['_orig_path']
        # XXX: refers to the target and not the portal
        old_url = self.target + orig_path

        # this downloads file. We need a way to do this without the download
        _, host, targetpath, _, _, _ = urlparse.urlparse(self.target)
        if "@" in host:
            auth, host = host.split("@")
        else:
            auth = None

        conn = httplib.HTTPConnection(host)
        headers = {}
        if auth:
            auth = "Basic " + string.strip(base64.encodestring(auth))
            headers["Authorization"] = auth
        # /view is a hack as zope seems to send all content on head request
        conn.request("HEAD", targetpath + orig_path, headers=headers)
        res = conn.getresponse()
        redir = res.status == 301
        if redir and res.getheader("location"):
            _, _, oldpath, _, _, _ = urlparse.urlparse(res.getheader("location"))
            parts = oldpath.split("/")
            if parts[-1] == "view":
                parts = parts[:-1]
            return "/".join(parts)
        if res.status == 200:
            return orig_path
        return None
Example #10
    def _cal_depth(self, url):
        # calculate depth of a given URL, return tuple (url, depth)
        if url.find('#') >= 0: url = url[:url.find('#')]    # cut off fragment
        if url.find('?') >= 0: url = url[:url.find('?')]    # cut off query string
        if url.startswith('//'):
            return '', 10000    # //www.baidu.com/index.php, ignored
        if not urlparse.urlparse(url, 'http').scheme.startswith('http'):
            return '', 10000    # no HTTP protocol, ignored

        if url.startswith('http'):
            _ = urlparse.urlparse(url, 'http')
            if _.netloc == self.host:    # same hostname
                url = _.path
            else:
                return '', 10000         # not same hostname, ignored
        while url.find('//') >= 0:
            url = url.replace('//', '/')

        if not url:
            return '/', 1         # http://www.example.com

        if url[0] != '/': url = '/' + url
        url = url[: url.rfind('/')+1]
        depth = url.count('/')
        return url, depth
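
A small sketch of the last two steps above (the path value is hypothetical): everything after the final '/' is discarded, and the depth is simply the number of '/' characters that remain.

path = '/a/b/c.html'
path = path[: path.rfind('/') + 1]
print(path)             # '/a/b/'
print(path.count('/'))  # 3, so _cal_depth would return ('/a/b/', 3)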
Example #11
def absolute_url(url, base_href):
    """
    >>> absolute_url('foo', 'http://base/whatever/ooo/fdsh')
    'http://base/whatever/ooo/foo'

    >>> absolute_url('foo/bar/', 'http://base')
    'http://base/foo/bar/'

    >>> absolute_url('/foo/bar', 'http://base/whatever/fdskf')
    'http://base/foo/bar'

    >>> absolute_url('\\n/foo/bar', 'http://base/whatever/fdskf')
    'http://base/foo/bar'

    >>> absolute_url('http://localhost/foo', 'http://base/whatever/fdskf')
    'http://localhost/foo'
    """
    url = url.strip()
    proto = urlparse(url)[0]
    if proto:
        return url

    base_url_parts = urlparse(base_href)
    base_server = '://'.join(base_url_parts[:2])
    if url.startswith('/'):
        return base_server + url
    else:
        path = base_url_parts[2]
        if '/' in path:
            path = path.rsplit('/', 1)[0] + '/'
        else:
            path = '/'
        return base_server + path + url
Example #12
        def scan_locs_task():
            """Yield a task to calculate the dependencies of the sitemap.

            Other tasks can depend on this output, instead of having
            to scan locations.
            """
            scan_locs()

            # Generate a list of file dependencies for the actual generation
            # task, so rebuilds are triggered.  (Issue #1032)
            output = kw["output_folder"]
            file_dep = []

            for i in urlset.keys():
                p = os.path.join(output, urlparse(i).path.replace(base_path, '', 1))
                if not p.endswith('sitemap.xml') and not os.path.isdir(p):
                    file_dep.append(p)
                if os.path.isdir(p) and os.path.exists(os.path.join(p, 'index.html')):
                    file_dep.append(p + 'index.html')

            for i in sitemapindex.keys():
                p = os.path.join(output, urlparse(i).path.replace(base_path, '', 1))
                if not p.endswith('sitemap.xml') and not os.path.isdir(p):
                    file_dep.append(p)
                if os.path.isdir(p) and os.path.exists(os.path.join(p, 'index.html')):
                    file_dep.append(p + 'index.html')

            return {'file_dep': file_dep}
Example #13
    def getRepStr(self):
        urlList1 = [ i for i in self.url1.replace('http://', '').split('/') if i]
        urlList2 = [ i for i in self.url2.replace('http://', '').split('/') if i]
#        print urlList1
#        print urlList2

        n = 0
        while True:
            if urlList1[:n]==urlList2[:n]:
                n+=1
                if n>10:
                    break
                continue
            break

        urlPart = 'http://'+'/'.join(urlList1[:n-1])
        if urlparse(urlPart).netloc and ('.' not in urlparse(urlPart).path):
            urlPart += '/'

        urlListLen = len(urlList1[n-1:])

        if urlListLen<1:
            return (urlPart, './')

        if urlListLen>=1:
            return (urlPart, urlListLen*'../', self.url1, self.url2)
Example #14
 def snapshot(self, newURLs, fetchedURLFragments = []):
     """ main method that crawls and saves html snapshots by recursive calls """
     if newURLs is None or len(newURLs) == 0:
         return 0
     fetchedURLFragments = fetchedURLFragments[:]  # copy so recursive calls don't mutate the caller's list
     # logging.debug("URLs: %s, fetchedURLFragments: %s" % (newURLs, fetchedURLFragments))
     processedURLCount = 0
     if len(newURLs) == 0:
         return 0
     if self.domain == None:
         self.domain = urlparse(newURLs[0]).netloc
         logging.debug("first URL is '%s', setting valid domain as '%s'" % (newURLs[0], self.domain))
     newURLFragments = [urlparse(newURL).fragment[1:] for newURL in newURLs if newURL.find("#!") != -1 and (urlparse(newURL).netloc == self.domain or urlparse(newURL).netloc == '')]
     logging.debug("found %d valid URLs to process" % (len(newURLFragments)))
     # ignored url path, because we don't need it for now
     # stripped ! character from fragment
     if len(newURLFragments) == 0:
         logging.warn("only URLs with #! hashbang are valid!")
         return 0
     for newURLFragment in newURLFragments:
         if newURLFragment in fetchedURLFragments:
             logging.debug("URL-'%s' was fetched before", newURLFragment)
             continue
         newURL = "http://" + self.domain + "#!" + newURLFragment
         logging.info("fetching URL-'%s'" % (newURL))
         fetchedURLFragments.append(newURLFragment)
         response = os.popen(self.snapshot_cmd % (newURL)).read()
         self.saveResponse(newURLFragment, response)
         foundURLs = self.extractHrefsFromHTML(response)
         self.snapshot(foundURLs, fetchedURLFragments)
         processedURLCount += 1
     return processedURLCount
Example #15
 def __init__(self, layer, mapfile, fonts=None):
     """ Initialize Mapnik provider with layer and mapfile.
         
         XML mapfile keyword arg comes from TileStache config,
         and is an absolute path by the time it gets here.
     """
     maphref = urljoin(layer.config.dirpath, mapfile)
     scheme, h, path, q, p, f = urlparse(maphref)
     
     if scheme in ('file', ''):
         self.mapfile = path
     else:
         self.mapfile = maphref
     
     self.layer = layer
     self.mapnik = None
     
     engine = mapnik.FontEngine.instance()
     
     if fonts:
         fontshref = urljoin(layer.config.dirpath, fonts)
         scheme, h, path, q, p, f = urlparse(fontshref)
         
         if scheme not in ('file', ''):
             raise Exception('Fonts from "%s" can\'t be used by Mapnik' % fontshref)
     
         for font in glob(path.rstrip('/') + '/*.ttf'):
             engine.register_font(str(font))
Example #16
def provision():
    if exec_ctx == 'spark_ec2':
        eggo.spark_ec2.provision()
    elif exec_ctx == 'director':
        eggo.director.provision()
    # at this point, get_master() should be valid

    # if the DFS is on the local fs, the directories may need to be created
    url = urlparse(eggo_config.get('dfs', 'dfs_root_url'))
    if url.scheme == 'file':
        local('mkdir -p {0}'.format(url.path))
        url = urlparse(eggo_config.get('dfs', 'dfs_raw_data_url'))
        local('mkdir -p {0}'.format(url.path))
        url = urlparse(eggo_config.get('dfs', 'dfs_tmp_data_url'))
        local('mkdir -p {0}'.format(url.path))

    # tag all the provisioned instances
    if exec_ctx in ['spark_ec2', 'director']:
        conn = connect_to_region(eggo_config.get(exec_ctx, 'region'))
        instances = conn.get_only_instances(
            filters={'key-name': [eggo_config.get('aws', 'ec2_key_pair')]})
        for instance in instances:
            instance.add_tag('owner', getuser())
            instance.add_tag('stack_name',
                             eggo_config.get(exec_ctx, 'stack_name'))
Example #17
    def __init__(self, uri, consumer, extra_headers=None):
        asyncore.dispatcher_with_send.__init__(self)

        # turn the uri into a valid request
        scheme, host, path, params, query, fragment = urlparse.urlparse(uri)

        # use origin host
        self.host = host

        # get proxy settings, if any
        proxy = self.proxies.get(scheme)
        if proxy:
            scheme, host, x, x, x, x = urlparse.urlparse(proxy)

        assert scheme == "http", "only supports HTTP requests (%s)" % scheme

        if not path:
            path = "/"
        if params:
            path = path + ";" + params
        if query:
            path = path + "?" + query
        if proxy:
            path = scheme + "://" + self.host + path

        self.path = path

        # get port number
        try:
            host, port = host.split(":", 1)
            port = int(port)
        except (TypeError, ValueError):
            port = 80 # default port

        self.consumer = consumer

        self.status = None
        self.header = None

        self.bytes_in = 0
        self.bytes_out = 0

        self.content_type = None
        self.content_length = None
        self.content_encoding = None
        self.transfer_encoding = None

        self.data = ""

        self.chunk_size = None

        self.timestamp = time.time()

        self.extra_headers = extra_headers

        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            self.connect((host, port))
        except socket.error:
            self.consumer.http(0, self, sys.exc_info())
Example #18
def get_outgoing_url(url):
    """
    Bounce a URL off an outgoing URL redirector, such as
    outgoing.prod.mozaws.net.
    """
    if not settings.REDIRECT_URL:
        return url

    parsed_url = urlparse(url)
    url_netloc = parsed_url.netloc

    # This prevents a link like javascript://addons.mozilla.org...
    # being returned unchanged since the netloc matches the
    # safe list see bug 1251023
    if parsed_url.scheme not in ['http', 'https']:
        return '/'

    # No double-escaping, and some domain names are excluded.
    if (url_netloc == urlparse(settings.REDIRECT_URL).netloc or
            url_netloc in settings.REDIRECT_URL_ALLOW_LIST):
        return url

    url = force_bytes(jinja2.utils.Markup(url).unescape())
    sig = hmac.new(settings.REDIRECT_SECRET_KEY,
                   msg=url, digestmod=hashlib.sha256).hexdigest()
    # Let '&=' through so query params aren't escaped.  We probably shouldn't
    # bother to quote the query part at all.
    return '/'.join([settings.REDIRECT_URL.rstrip('/'), sig,
                     urllib.quote(url, safe='/&=')])
Example #19
File: sr.py  Project: djs55/ffs
    def create(self, dbg, uri, name, description, configuration):
        u = urlparse.urlparse(uri)
        # sometimes a user can believe that a device exists because
        # they've just created it, but they don't realise that the actual
        # device will be created by a queued udev event. Make the client's
        # life easier by waiting for outstanding udev events to complete.
        code = subprocess.call(["udevadm", "settle"])
        # if that fails then log and continue
        if code != 0:
            log.info("udevadm settle exitted with code %d" % code)

        p = subprocess.Popen(["mkfs.btrfs",
                              u.path,
                              "-f"],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        if p.returncode != 0:
            raise xapi.storage.api.volume.Unimplemented("mkfs.btrfs failed on %s" % u.path)
        local_uri = self.attach(dbg, uri)
        with open(urlparse.urlparse(local_uri).path + "/.json", "w") as fd:
            meta = {
                "name": name,
                "description": description
            }
            json.dump(meta, fd)
            fd.write("\n")
        self.detach(dbg, local_uri)
        return
Example #20
 def get_object(self):
     user = self.request.user
     manual_redirect_uri = self.request.auth_data.pop('redirect_uri', None)
     manual_redirect_uri = self.get_redirect_uri(manual_redirect_uri)
     if manual_redirect_uri:
         self.request.backend.redirect_uri = manual_redirect_uri
     elif DOMAIN_FROM_ORIGIN:
         origin = self.request.strategy.request.META.get('HTTP_ORIGIN')
         if origin:
             relative_path = urlparse(self.request.backend.redirect_uri).path
             url = urlparse(origin)
             origin_scheme_host = "%s://%s" % (url.scheme, url.netloc)
             location = urljoin(origin_scheme_host, relative_path)
             self.request.backend.redirect_uri = iri_to_uri(location)
     is_authenticated = user_is_authenticated(user)
     user = is_authenticated and user or None
     # skip checking state by setting following params to False
     # it is responsibility of front-end to check state
     # TODO: maybe create an additional resource, where front-end will
     # store the state before making a call to oauth provider
     # so server can save it in session and consequently check it before
     # sending request to acquire access token.
     # In case of token authentication we need a way to store an anonymous
     # session to do it.
     self.request.backend.REDIRECT_STATE = False
     self.request.backend.STATE_PARAMETER = False
     user = self.request.backend.complete(user=user)
     return user
Example #21
 def process_page_links(self, raw_html, url):
     """
     simply extracts html links using awesome beautifulsoup
     """
     beautiful_html = BeautifulSoup(raw_html)
     
     links = [a.get('href') for a in beautiful_html.find_all('a')]
     links = [link for link in links if link is not None]
     
     for link in links:
         link_info = urlparse.urlparse(link)
         
         if not link_info.scheme and not link_info.netloc:
             link = urlparse.urljoin(url, link)
             link_info = urlparse.urlparse(link)
         
         if('http' not in link_info.scheme) : continue
         
         if self.domain not in link_info.netloc:
             if not self.allow_external :
                 continue  # throwing out external link
             else:
                 priority = 2  # insert external link with low priority
         else:
             priority = 1
         self.unparsed_urls.add(link, priority)
Example #22
	def parseImgLinks(self,depth=1):
		url_response = None
		try:
			url_response = urllib2.urlopen(self.scrap_url,timeout=self._timeout)
		except Exception as e:
			print("   [ERROR]: Could not open {0}: {1}".format(self.scrap_url,e.reason))
			return self.img_list

		html_parse = BeautifulSoup(url_response)
		unique_images_found = 0
		total_images_found = 0
		self.visited[self.scrap_url] = 1

		for img in html_parse.findAll('img'):
			try:
				abs_url = urljoin(self.scrap_url,img['src']) if urlparse(img['src']).netloc == "" else img['src']
				if abs_url not in self.img_list:
					self.img_list.add(abs_url)
					unique_images_found += 1
				total_images_found += 1
			except:
				pass

		print("   [Found %d images / %d new]: %s" % (total_images_found,unique_images_found,self.scrap_url))
		if depth > 1:
			for a in html_parse.findAll('a'):
				try:
					if (urlparse(a['href']).netloc == "") or (urlparse(self.scrape_url_orig).netloc == urlparse(a['href']).netloc):
						self.scrap_url = urljoin(self.scrape_url_orig,a['href'])
						if self.scrap_url in self.visited: continue
						self.parseImgLinks(depth - 1)
				except:
					pass
		return self.img_list
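
A short sketch of the urljoin/netloc branch above (URLs are hypothetical): relative src values are resolved against the page URL, while values that already carry a netloc are used unchanged.

from urlparse import urlparse, urljoin  # Python 2, as in the snippet

page = 'http://example.com/gallery/index.html'
print(urljoin(page, 'img/photo.png'))                   # http://example.com/gallery/img/photo.png
print(urlparse('http://cdn.example.com/x.png').netloc)  # 'cdn.example.com' -> non-empty, kept as-is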
Example #23
    def test_that_checks_redirect_using_incorrect_query_values(self, base_url):
        param = {
            'product': 'firefox-31.0',
            'lang': 'kitty_language',
            'os': 'stella'
        }
        response = self._head_request(base_url, params=param)

        assert requests.codes.not_found == response.status_code, \
            self.response_info_failure_message(base_url, param, response)

        parsed_url = urlparse(response.url)

        assert 'http' == parsed_url.scheme, \
            'Failed to redirect to the correct scheme. %s' % \
            self.response_info_failure_message(base_url, param, response)

        assert urlparse(base_url).netloc == parsed_url.netloc, \
            self.response_info_failure_message(base_url, param, response)

        assert urlencode(param) == parsed_url.query, \
            self.response_info_failure_message(base_url, param, response)

        assert 'Unknown' != self.get_x_backend_server(response), \
            'Failed, x-backend-server was not in the response object. %s' % \
            self.response_info_failure_message(base_url, param, response)
Example #24
    def get_show(self, imdb, tvdb, show, show_alt, year):
        try:
            query = self.search_link
            post = urllib.urlencode({'searchquery': show, 'searchin': '2'})

            result = ''
            links = [self.link_1, self.link_3]
            for base_link in links:
                result = client.source(urlparse.urljoin(base_link, query), post=post, headers=self.headers)
                if 'widget search-page' in str(result): break

            result = client.parseDOM(result, "div", attrs = { "class": "widget search-page" })[0]
            result = client.parseDOM(result, "td")

            shows = [cleantitle.tv(show), cleantitle.tv(show_alt)]
            years = ['(%s)' % str(year), '(%s)' % str(int(year)+1), '(%s)' % str(int(year)-1)]
            result = [(client.parseDOM(i, "a", ret="href")[-1], client.parseDOM(i, "a")[-1]) for i in result]
            result = [i for i in result if any(x == cleantitle.tv(i[1]) for x in shows)]
            result = [i[0] for i in result if any(x in i[1] for x in years)][0]

            url = client.replaceHTMLCodes(result)
            try: url = urlparse.parse_qs(urlparse.urlparse(url).query)['u'][0]
            except: pass
            url = urlparse.urlparse(url).path
            url = url.encode('utf-8')
            return url
        except:
            return
    def sources(self, url, hostDict, hostprDict):
        try:
            sources = []

            if url == None: return sources

            url = urlparse.urljoin(self.base_link, url)

            r = proxy.request(url, 'tv shows')

            links = client.parseDOM(r, 'a', ret='href', attrs = {'target': '.+?'})
            links = [x for y,x in enumerate(links) if x not in links[:y]]

            for i in links:
                try:
                    url = i
                    url = proxy.parse(url)
                    url = urlparse.parse_qs(urlparse.urlparse(url).query)['r'][0]
                    url = url.decode('base64')
                    url = client.replaceHTMLCodes(url)
                    url = url.encode('utf-8')

                    host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(url.strip().lower()).netloc)[0]
                    if not host in hostDict: raise Exception()
                    host = host.encode('utf-8')

                    sources.append({'source': host, 'quality': 'SD', 'language': 'en', 'url': url, 'direct': False, 'debridonly': False})
                except:
                    pass

            return sources
        except:
            return sources
Example #26
    def get_download_url_ssl(self):
        """
        SSL-enabled links should be used for the specific versions, except the
        Windows stub installers.
        """

        # SSL-enabled links won't be used for 26.0
        url = firefox_details.get_download_url("OS X", "pt-BR", "26.0")
        self.assertListEqual(
            parse_qsl(urlparse(url).query), [("product", "firefox-26.0"), ("os", "osx"), ("lang", "pt-BR")]
        )

        # SSL-enabled links won't be used for 27.0 Windows builds (but SSL
        # download is enabled by default for stub installers)
        url = firefox_details.get_download_url("Windows", "pt-BR", "27.0")
        self.assertListEqual(
            parse_qsl(urlparse(url).query), [("product", "firefox-27.0"), ("os", "win"), ("lang", "pt-BR")]
        )

        # SSL-enabled links will be used for 27.0 OS X builds
        url = firefox_details.get_download_url("OS X", "pt-BR", "27.0")
        self.assertListEqual(
            parse_qsl(urlparse(url).query), [("product", "firefox-27.0-SSL"), ("os", "osx"), ("lang", "pt-BR")]
        )

        # SSL-enabled links will be used for 27.0 Linux builds
        url = firefox_details.get_download_url("Linux", "pt-BR", "27.0")
        self.assertListEqual(
            parse_qsl(urlparse(url).query), [("product", "firefox-27.0-SSL"), ("os", "linux"), ("lang", "pt-BR")]
        )
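
A brief sketch of why the test compares with parse_qsl() rather than parse_qs(): parse_qsl preserves parameter order as a list of tuples, which is what assertListEqual needs. The URL below is hypothetical.

from urlparse import urlparse, parse_qs, parse_qsl  # Python 2

query = urlparse('https://download.example.com/?product=firefox-27.0-SSL&os=osx&lang=pt-BR').query
print(parse_qsl(query))  # [('product', 'firefox-27.0-SSL'), ('os', 'osx'), ('lang', 'pt-BR')]
print(parse_qs(query))   # same pairs as a dict of lists, with no guaranteed order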
    def mangle_url(self, url):
        self.check_connection()

        try:
            endpoint_url = urlparse.urlparse(url)
        except Exception as e:
            script_unknown("you must provide an endpoint_url in the form"
                           + "<scheme>://<url>/ (%s)\n" % e)
        scheme = endpoint_url.scheme
        if scheme is None:
            script_unknown("you must provide an endpoint_url in the form"
                           + "<scheme>://<url>/ (%s)\n" % e)
        catalog_url = None
        try:
            catalog_url = urlparse.urlparse(
                self.nova_client.client.management_url)
        except Exception as e:
            script_unknown("unknown error parsing the catalog url : %s\n" % e)

        port = endpoint_url.port
        if port is None:
            if catalog_url.port is None:
                port = 8774
            else:
                port = catalog_url.port

        netloc = "%s:%i" % (endpoint_url.hostname, port)
        url = urlparse.urlunparse([scheme,
                                   netloc,
                                   catalog_url.path,
                                   catalog_url.params,
                                   catalog_url.query,
                                   catalog_url.fragment])
        self.nova_client.client.set_management_url(url)
Example #28
def generateUrls(url):
	baseulp = urlparse(url)
	host = baseulp.netloc

	paths = getCrawlerPaths(url)
	#pprint(paths)
	
	urls = []
	rulefile = BASEDIR + '/lib/db/compresed_file.rule'
	for eachpath in paths:
		eachulp = urlparse(eachpath)
		if eachulp.path == '':
			host = eachulp.netloc
			domain = GetFirstLevelDomain(host)
			args = {'host':host,'com':domain}

		else:
			pos = eachulp.path.rfind('/')
			tmp = eachulp.path[pos+1:]
			args = {'com':tmp}

		rf = RuleFile(rulefile,args)
		rf._getRules()
		for i in rf.ret:
			urls.append(eachpath + '/' +i)

	ret = list(set(urls))
	ret.sort()
	return ret
Example #29
def get_all_href_list(root_my_url, soup, file_encode):

    root_parse = urlparse.urlparse(root_my_url.get_abs_url())
    href_list = []

    if not root_parse.hostname:
        return href_list

    # get tags' href
    tag_list = soup.find_all(['a', 'img', 'link'])
    href_filter = r'#|\n|(mailto:)'

    for tag in tag_list:
        add_my_url = DownloadUrl(None, None, root_my_url.get_abs_path())

        if tag.get('href') and not re.search(href_filter, tag.get('href')):
            add_my_url.url = tag.get('href')
        elif tag.get('src'):
            add_my_url.url = tag.get('src')

        if add_my_url.url:
            temp_parse = urlparse.urlparse(add_my_url.url)
            if temp_parse.hostname:
                add_my_url.host = temp_parse.hostname
            else:
                add_my_url.host = root_parse.hostname
            href_list.append(add_my_url)

    return href_list
Example #30
    def do_POST(self):
        try:
            refer = self.headers.getheader('Referer')
            netloc = urlparse.urlparse(refer).netloc
            if not netloc.startswith("127.0.0.1") and not netloc.startswitch("localhost"):
                xlog.warn("web control ref:%s refuse", netloc)
                return
        except:
            pass

        xlog.debug ('GAEProxy web_control %s %s %s ', self.address_string(), self.command, self.path)
        try:
            ctype, pdict = cgi.parse_header(self.headers.getheader('content-type'))
            if ctype == 'multipart/form-data':
                self.postvars = cgi.parse_multipart(self.rfile, pdict)
            elif ctype == 'application/x-www-form-urlencoded':
                length = int(self.headers.getheader('content-length'))
                self.postvars = urlparse.parse_qs(self.rfile.read(length), keep_blank_values=1)
            else:
                self.postvars = {}
        except:
            self.postvars = {}

        path = urlparse.urlparse(self.path).path
        if path == '/deploy':
            return self.req_deploy_handler()
        elif path == "/config":
            return self.req_config_handler()
        elif path == "/scan_ip":
            return self.req_scan_ip_handler()
        elif path.startswith("/importip"):
            return self.req_importip_handler()
        else:
            self.wfile.write(b'HTTP/1.1 404\r\nContent-Type: text/plain\r\nConnection: close\r\n\r\n404 Not Found')
            xlog.info('%s "%s %s HTTP/1.1" 404 -', self.address_string(), self.command, self.path)
__PathdirtyDataFile = '/home/kostas/AraxniProject/scripts/spiderOutput.csv'
__PathcleanDataFile = '/home/kostas/AraxniProject/scripts/sqlErrorCheck/input.txt' #pass to sqlErrorCheck.py
goodLinks = []
index = 0
idx=0

print 'Starting IceFilter, long waiting time process.',
sys.stdout.flush()

for inputLink in open(__PathdirtyDataFile,'r').readlines():
    regexBanned = re.search(r'google|facebook.com|youtube.com|yahoo.com|baidu.com|wikipedia.org|live.com|twitter.com|qq.com|msn.com|yahoo.co.jp|linkedin.com|taobao.com|google.co.in|sina.com.cn|amazon.com|wordpress.com|google.com.hk|google.de|bing.com|google.co.uk|yandex.ru|ebay.com|163.com|google.co.jp|google.fr|microsoft.com|paypal.com|google.com.br|mail.ru|craigslist.org|fc2.com|google.it|apple.com|google.es|imdb.com|google.ru|weibo.com|vkontakte.ru|sohu.com|bbc.co.uk|ask.com|tumblr.com|livejasmin.com|xvideos.com|go.com|youku.com|bp.blogspot.com|cnn.com|soso.com|google.ca|aol.com|tudou.com|xhamster.com|ifeng.com|megaupload.com|mediafire.com|zedo.com|ameblo.jp|pornhub.com|google.co.id|godaddy.com|adobe.com|about.com|rakuten.co.jp|espn.go.com|alibaba.com|conduit.com|ebay.de|4shared.com|wordpress.org|livejournal.com|google.com.mx|google.com.tr|livedoor.com|yieldmanager.com|google.com.au|blogger.com|youporn.com|renren.com|cnet.com|uol.com.br|google.pl|myspace.com|ebay.co.uk|chinaz.com|nytimes.com|thepiratebay.org|doubleclick.com',inputLink)
    if regexBanned is None:
        flagStepOver = False
        for index,goodLink in enumerate(goodLinks):
            inputLinkNetloc = urlparse(inputLink).netloc
            goodLinkNetloc = urlparse(goodLink).netloc
            if inputLinkNetloc == goodLinkNetloc:
                goodlinkQuery = urlparse(goodLink).query
                inputLinkQuery = urlparse(inputLink).query
                if len(inputLinkQuery) > len(goodlinkQuery):    #replace link
                    idx = idx - 1
                    del goodLinks[idx]
                    goodLinks.append(inputLink)
                    print '[',inputLink,'] inserting/replacing...'
                    idx = idx + 1
                    flagStepOver = True  # so the final if-block is not executed
                else:
                    #print '         (len)inputLinkQuery < (len)goodlinkQuery DO NOTHING'
                    flagStepOver = True
        if flagStepOver == False:
Example #32
    def get_canonical_string(self, url, headers, method):
        parsedurl = urlparse(url)
        objectkey = parsedurl.path[1:]
        query_args = sorted(parsedurl.query.split('&'))

        bucket = parsedurl.netloc[:-len(self.service_base_url)]
        if len(bucket) > 1:
            # remove last dot
            bucket = bucket[:-1]

        interesting_headers = {
            'content-md5': '',
            'content-type': '',
            'date': ''
        }
        for key in headers:
            lk = key.lower()
            try:
                lk = lk.decode('utf-8')
            except:
                pass
            if headers[key] and (lk in interesting_headers.keys()
                                 or lk.startswith('x-amz-')):
                interesting_headers[lk] = headers[key].strip()

        # If x-amz-date is used it supersedes the date header.
        if 'x-amz-date' in interesting_headers:
            interesting_headers['date'] = ''

        buf = '%s\n' % method
        for key in sorted(interesting_headers.keys()):
            val = interesting_headers[key]
            if key.startswith('x-amz-'):
                buf += '%s:%s\n' % (key, val)
            else:
                buf += '%s\n' % val

        # append the bucket if it exists
        if bucket != '':
            buf += '/%s' % bucket

        # add the objectkey. even if it doesn't exist, add the slash
        buf += '/%s' % objectkey

        params_found = False

        # handle special query string arguments
        for q in query_args:
            k = q.split('=')[0]
            if k in self.special_params:
                buf += '&' if params_found else '?'
                params_found = True

                try:
                    k, v = q.split('=', 1)

                except ValueError:
                    buf += q

                else:
                    # Riak CS multipart upload ids look like this, `TFDSheOgTxC2Tsh1qVK73A==`,
                    # is should be escaped to be included as part of a query string.
                    #
                    # A requests mp upload part request may look like
                    # resp = requests.put(
                    #     'https://url_here',
                    #     params={
                    #         'partNumber': 1,
                    #         'uploadId': 'TFDSheOgTxC2Tsh1qVK73A=='
                    #     },
                    #     data='some data',
                    #     auth=S3Auth('access_key', 'secret_key')
                    # )
                    #
                    # Requests automatically escapes the values in the `params` dict, so now
                    # our uploadId is `TFDSheOgTxC2Tsh1qVK73A%3D%3D`,
                    # if we sign the request with the encoded value the signature will
                    # not be valid, we'll get 403 Access Denied.
                    # So we unquote, this is no-op if the value isn't encoded.
                    buf += '{key}={value}'.format(key=k, value=unquote(v))

        return buf
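
A minimal demonstration of the unquote() step described in the comments above, using the upload id value from the comment:

from urllib import unquote  # Python 2

print(unquote('TFDSheOgTxC2Tsh1qVK73A%3D%3D'))  # TFDSheOgTxC2Tsh1qVK73A==
print(unquote('TFDSheOgTxC2Tsh1qVK73A=='))      # already decoded -- unquote is a no-op here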
Example #33
 def get_host_ip_addr():
     xs_url = urlparse.urlparse(FLAGS.xenapi_connection_url)
     return xs_url.netloc
Example #34
File: 3.py  Project: cnxuan05/cms
def parse_qs():
    url = 'https://www.google.com.hk/#newwindow=1&safe=strict&q=bekk-garch%E6%A8%A1%E5%9E%8B'
    result = urlparse.urlparse(url)
    print(result)
    pass
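
A hedged follow-up: for this URL the parameters sit in the fragment rather than the query string, so extracting q takes parse_qs() on the fragment.

import urlparse  # Python 2

url = 'https://www.google.com.hk/#newwindow=1&safe=strict&q=bekk-garch%E6%A8%A1%E5%9E%8B'
fragment = urlparse.urlparse(url).fragment
print(urlparse.parse_qs(fragment)['q'][0])  # 'bekk-garch' plus the percent-decoded UTF-8 term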
Example #35
def signOut():
	print 'Signing out user'
	response.delete_cookie("account")
	redirect(urlparse.parse_qs(urlparse.urlparse(request.url).query)['redirectUrl'][0])
Example #36
def getSignin():
    try:
        redirectUrl = urlparse.parse_qs(urlparse.urlparse(request.url).query)['redirectUrl'][0]
    except:
        redirectUrl = 'home'
    return bottle.template('signin', redirectUrl=redirectUrl)    
Example #37
def getRegistration():
	try:
		redirectUrl = urlparse.parse_qs(urlparse.urlparse(request.url).query)['redirectUrl'][0]
	except:
		redirectUrl = 'home'
	return bottle.template('register', redirectUrl = redirectUrl)
Example #38
    def get_data(self, already_downloaded_sources):
        """
        Download the webbug.log files and store the detections in the file wb-[server]-[username]-events.log
        """
        self.fetched_data = []
        if not self.enabled:
            return False

        for ftpsite in self.__webbug_log:

            addr = urlparse.urlparse(ftpsite)

            url_downloaded = [
                downloaded[0] for downloaded in already_downloaded_sources
            ]
            sremoteaddr = addr.scheme + "://" + addr.hostname + addr.path
            if url_downloaded.count(
                    sremoteaddr
            ):  # If the file has already been downloaded by another plugin, reuse it
                already_downloaded_file = already_downloaded_sources[
                    url_downloaded.index(sremoteaddr)]
                self.fetched_data.append(already_downloaded_file)
                self.__logger.info('Reusado %s de %s descargado recientemente',
                                   addr.path, addr.hostname)

            else:  # otherwise, download it
                filename = "temp/wb-" + addr.hostname + '-' + self.person + "-access.log"
                localfile = open(filename, 'wb')
                try:
                    connftp = ftplib.FTP(addr.hostname)
                    connftp.login(addr.username, addr.password)
                    connftp.retrbinary('RETR ' + addr.path, localfile.write)
                    self.fetched_data.append([
                        addr.scheme + "://" + addr.hostname + addr.path,
                        filename
                    ])
                    self.__logger.info('Descargado webbug.log de %s',
                                       addr.hostname)

                    void_file = open("temp/void_file", 'wb')
                    void_file.close()
                    void_file = open("temp/void_file", "rb")
                    connftp.storbinary("STOR " + addr.path,
                                       void_file)  # empty out webbug.log on the server
                    void_file.close()

                    connftp.quit()
                    localfile.close()
                except:
                    self.__logger.error(
                        'Error conectando al servidor [%s] de %s',
                        addr.hostname, self.person)

                localfile.close()

        line_parser = apache_log_parser.make_parser(
            self.__webbug_log_format.decode('string_escape'))
        for fdownloaded in self.fetched_data:

            ftemp_accesslog = open(fdownloaded[1], 'r')
            addr = urlparse.urlparse(fdownloaded[0])
            fevents_filename = "data/wb-" + addr.hostname + '-' + self.person + "-events.log"

            try:  # read the date and time from the last line, if the file exists
                with open(fevents_filename, "rb") as sf:
                    sfirstline = sf.readline()
                    sf.seek(-2, 2)
                    try:
                        while sf.read(1) != "\n":
                            sf.seek(-2, 1)
                        slastline = sf.readline()
                    except IOError:
                        slastline = sfirstline
                last_event_logged = line_parser(slastline)
                last_event_logged_time = last_event_logged[
                    'time_received_datetimeobj']
            except IOError:
                last_event_logged_time = datetime(1, 1, 1, 1, 1, 1)

            fevents = open(fevents_filename, 'a')

            while True:
                linea = ftemp_accesslog.readline()
                if not linea:
                    break
                log_line_data = line_parser(linea)
                referer_url = log_line_data['request_header_referer']
                web_bug_location = log_line_data['request_first_line']
                ref = Referer(referer_url)

                if (last_event_logged_time < log_line_data['time_received_datetimeobj']) and (web_bug_location != "-")\
                        and ((self.__weight_visit > 0) or (ref.medium == 'search')):

                    if ref.search_term is not None:
                        ref_list = unidecode(
                            ref.search_term.decode('utf-8')).replace(
                                "\"", "").split()
                        if eval(self.__eval_expression):
                            fevents.write(linea)
                    else:
                        fevents.write(linea)

            ftemp_accesslog.close()
            fevents.close()
        if self.fetched_data:
            return True
Example #39
    def eval_data(self, time_frame, analyzed_time, given_time, confirmed_ips):
        """
        Return a list with one element for each of the last 'check_interval' minutes before 'given_time'.
        Each element of the returned list holds the accumulated value of the detections during the
        previous 'time_frame' minutes.
        """
        eval_time = time_frame + analyzed_time
        detect_list = [0] * eval_time
        acum_list = [0] * analyzed_time
        if not self.enabled:
            return acum_list

        time_now_utc = datetime(given_time.year, given_time.month,
                                given_time.day, given_time.hour,
                                given_time.minute)

        line_parser = apache_log_parser.make_parser(
            self.__webbug_log_format.decode('string_escape'))
        for remoteaddr in self.__webbug_log:

            addr = urlparse.urlparse(
                remoteaddr)  # used to derive the events file name
            filename = "data/wb-" + addr.hostname + '-' + self.person + "-events.log"

            with open(filename, 'r') as f:

                linea = f.readline(
                )  # detect the timezone from the first line of the log
                if linea:
                    p = re.compile(r"[\+|-]\d\d\d\d\]")
                    tz = p.findall(linea)[0]
                    timezone = timedelta(hours=int(tz[0:3]),
                                         minutes=int(tz[0] + tz[3:5]))

                visiting_ips = []
                while linea:
                    log_line_data = line_parser(linea)
                    current_ip = log_line_data['remote_host']
                    if confirmed_ips.count(current_ip):

                        l = log_line_data['time_received_datetimeobj']
                        line_time_utc = datetime(l.year, l.month, l.day,
                                                 l.hour, l.minute) - timezone

                        if line_time_utc > time_now_utc:
                            break

                        i = int(
                            (time_now_utc - line_time_utc).total_seconds() /
                            60)  # convert the time to a list index
                        if i < eval_time:
                            ref = Referer(
                                log_line_data['request_header_referer'])
                            origin = urlparse.urlparse(
                                log_line_data['request_first_line'])
                            if (ref.medium == 'search') and (
                                    ref.search_term
                                    is not None):  # a search with terms
                                detect_list[eval_time - i - 1] += self.__weight
                            elif (ref.medium == 'search') and (
                                    ref.search_term is
                                    None):  # a search without terms
                                detect_list[eval_time - i -
                                            1] += self.__weight_no_search_terms
                            elif (self.__weight_visit > 0) and \
                                    (not visiting_ips.count([current_ip, origin.hostname])):  # a plain visit
                                visiting_ips.append([
                                    current_ip, origin.hostname
                                ])  # they only score once per ip/origin
                                detect_list[eval_time - i -
                                            1] += self.__weight_visit

                    linea = f.readline()

                for i in range(
                        1, analyzed_time + 1
                ):  # accumulate detection weights over the given ranges
                    #print "acumulado", analyzed_time - i, "= suma desde",  eval_time - time_frame - i, "hasta", eval_time - i, "=", detect_list[eval_time - time_frame - i:eval_time - i + 1], "=", sum(detect_list[eval_time - time_frame - i:eval_time - i])
                    acum_list[analyzed_time - i] = sum(
                        detect_list[eval_time - time_frame - i:eval_time - i +
                                    1])

        return acum_list
Example #40
 def parse_endpoint(endpoint):
     return urlparse.urlparse(endpoint)
Example #41
 def test_home_page(self):
     response = self.client.get(url_for('main.index'))
     self.assertEqual(
         urlparse(response.location).path, url_for('auth.login'))
Example #42
    def get_report_data(self, time_frame, given_time, confirmed_ips):
        """
        Return a list with each of the detections during the 'time_frame' minutes prior to 'given_time'.
        Each element contains the detection time, the site where it was detected, the footprinter's IP,
        the score, and a descriptive text about the detection.
        """
        report_list = []
        if not self.enabled:
            return report_list
        delta_frame = timedelta(minutes=time_frame)

        line_parser = apache_log_parser.make_parser(
            self.__webbug_log_format.decode('string_escape'))
        for remoteaddr in self.__webbug_log:

            addr = urlparse.urlparse(
                remoteaddr)  # used to derive the events file name
            filename = "data/wb-" + addr.hostname + '-' + self.person + "-events.log"

            with open(filename, 'r') as f:

                linea = f.readline(
                )  # detect the timezone from the first line of the log
                if linea:
                    p = re.compile(r"[\+|-]\d\d\d\d\]")
                    tz = p.findall(linea)[0]
                    timezone = timedelta(hours=int(tz[0:3]),
                                         minutes=int(tz[0] + tz[3:5]))

                simple_visits = []
                while True:
                    if not linea:
                        break

                    log_line_data = line_parser(linea)
                    ip = log_line_data['remote_host']

                    if confirmed_ips.count(ip):

                        line_time_utc = log_line_data[
                            'time_received_datetimeobj'] - timezone

                        if line_time_utc > given_time:
                            break
                        if line_time_utc > given_time - delta_frame:
                            origin = log_line_data['request_first_line']
                            ref = Referer(
                                log_line_data['request_header_referer'])
                            origin_hostname = urlparse.urlparse(
                                origin).hostname
                            if (ref.medium == 'search') and (
                                    ref.search_term
                                    is not None):  # a search with terms
                                sterms = ref.search_term.decode('utf-8')
                                sengine = ref.referer.decode('utf-8')
                                description = u'Una busqueda desde [' + sengine + u'] con los terminos: [' + sterms +\
                                              u'] ha llegado a [' + origin + ']'
                                report_list.append([
                                    line_time_utc,
                                    log_line_data['remote_host'], description,
                                    'Web Bug'
                                ])

                            elif (ref.medium == 'search') and (
                                    ref.search_term is
                                    None):  # a search without terms
                                sengine = ref.referer.decode('utf-8')
                                description = u'Una busqueda desde [' + sengine + u'] ha llegado a [' + origin + ']'
                                report_list.append([
                                    line_time_utc,
                                    log_line_data['remote_host'], description,
                                    'Web Bug'
                                ])

                            elif (self.__weight_visit >
                                  0) and (not simple_visits.count(
                                      [ip, origin_hostname])):
                                simple_visits.append([ip, origin_hostname
                                                      ])  # a plain visit
                                description = u'Una visita ha llegado a [' + origin + ']'
                                report_list.append([
                                    line_time_utc,
                                    log_line_data['remote_host'], description,
                                    'Web Bug'
                                ])

                    linea = f.readline()

        if report_list:
            return sorted(report_list, key=itemgetter(0))
        else:
            return report_list
Example #43
def keywords(stmt):
    kw = {k.arg: k.value.s for k in stmt.keywords if k.arg in KEYS}
    path = kw.get("importpath", kw.get("remote"))

    u = urlparse(path)
    return u.netloc + u.path, kw["name"]
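
A short sketch of the normalisation done by keywords() above (paths are hypothetical): with or without a scheme, netloc + path reduces to the same repository path.

from urlparse import urlparse  # Python 2

for path in ('github.com/user/repo', 'https://github.com/user/repo'):
    u = urlparse(path)
    print(u.netloc + u.path)  # github.com/user/repo in both cases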
Example #44
def install_commands_ubuntu(package_name, distribution, package_source,
                            base_url):
    """
    Install Flocker package on Ubuntu.

    The ClusterHQ repo is added for downloading latest releases.  If
    ``package_source`` contains a branch, then a BuildBot repo will also
    be added to the package search path, to use in-development packages.
    Note, the ClusterHQ repo is always enabled, to provide dependencies.

    :param bytes package_name: The name of the package to install.
    :param bytes distribution: The distribution the node is running.
    :param PackageSource package_source: The source from which to install the
        package.
    :param base_url: URL of repository, or ``None`` if we're not using
        development branch.

    :return: a sequence of commands to run on the distribution
    """
    flocker_version = package_source.version
    if not flocker_version:
        # Support empty values other than None; '' was sometimes used to
        # indicate the latest version (previous behaviour).
        flocker_version = get_installable_version(version)
    commands = [
        # Minimal images often have cleared apt caches and are missing
        # packages that are common in a typical release.  These commands
        # ensure that we start from a good base system with the required
        # capabilities, particularly that the add-apt-repository command
        # is available, and HTTPS URLs are supported.
        run_from_args(["apt-get", "update"]),
        run_from_args([
            "apt-get", "-y", "install", "apt-transport-https",
            "software-properties-common"
        ]),

        # Add ClusterHQ repo for installation of Flocker packages.
        run(command='add-apt-repository -y "deb {} /"'.format(
            get_repository_url(distribution=distribution,
                               flocker_version=flocker_version)))
    ]

    if base_url is not None:
        # Add BuildBot repo for running tests
        commands.append(
            run_from_args(
                ["add-apt-repository", "-y", "deb {} /".format(base_url)]))
        # During a release, the ClusterHQ repo may contain packages with
        # a higher version number than the Buildbot repo for a branch.
        # Use a pin file to ensure that any Buildbot repo has higher
        # priority than the ClusterHQ repo.  We only add the Buildbot
        # repo when a branch is specified, so it will not interfere with
        # attempts to install a release (when no branch is specified).
        buildbot_host = urlparse(package_source.build_server).hostname
        commands.append(
            put(
                dedent('''\
            Package: *
            Pin: origin {}
            Pin-Priority: 700
        '''.format(buildbot_host)), '/tmp/apt-pref'))
        commands.append(
            run_from_args(
                ['mv', '/tmp/apt-pref',
                 '/etc/apt/preferences.d/buildbot-700']))

    # Update to read package info from new repos
    commands.append(run_from_args(["apt-get", "update"]))

    os_version = package_source.os_version()

    if os_version:
        # Set the version of the top-level package
        package_name += '=%s' % (os_version, )

        # If a specific version is required, ensure that the version for
        # all ClusterHQ packages is consistent.  This prevents conflicts
        # between the top-level package, which may depend on a lower
        # version of a dependency, and apt, which wants to install the
        # most recent version.  Note that this trumps the Buildbot
        # pinning above.
        commands.append(
            put(
                dedent('''\
            Package: clusterhq-*
            Pin: version {}
            Pin-Priority: 900
        '''.format(os_version)), '/tmp/apt-pref'))
        commands.append(
            run_from_args([
                'mv', '/tmp/apt-pref', '/etc/apt/preferences.d/clusterhq-900'
            ]))

    # Install package and all dependencies
    commands.append(
        run_from_args(
            ['apt-get', '-y', '--force-yes', 'install', package_name]))

    return sequence(commands)
Пример #45
0
    parser = argparse.ArgumentParser()
    parser.add_argument('db', help='leveldb root directory')
    args = parser.parse_args(sys.argv[1:])

    db = leveldb.LevelDB(args.db)

    stats = defaultdict(lambda: defaultdict(int))

    header = None
    domain = None
    # valid_languages = [l.lower() for l in args.lang]

    for line in sys.stdin:
        if line.startswith(magic_number):
            header = parse_line(line)
            # `uri`, `valid_languages` and the extra `args` options come from
            # parts of the script not shown in this excerpt.
            domain = urlparse(uri).netloc
            continue

        lang, percent, confidence = line.split()

        percent = int(percent)
        if percent < args.minpercent:
            continue

        if valid_languages and lang.lower() not in valid_languages:
            continue

        bytes_in_lang = header["bytes"] * percent / 100
        if bytes_in_lang >= args.minbytes:
            stats[domain][lang] += bytes_in_lang
Пример #46
0
        'USER': '******',
        'PASSWORD': '******',
        'HOST': '',
        'PORT': '',
    }
}

urlparse.uses_netloc.append('postgres')
urlparse.uses_netloc.append('mysql')

try:
    if 'DATABASES' not in locals():
        DATABASES = {}

    if 'DATABASE_URL' in os.environ:
        url = urlparse.urlparse(os.environ['DATABASE_URL'])

        # Ensure default database exists.
        DATABASES['default'] = DATABASES.get('default', {})

        # Update with environment configuration.
        DATABASES['default'].update({
            'NAME': url.path[1:],
            'USER': url.username,
            'PASSWORD': url.password,
            'HOST': url.hostname,
            'PORT': url.port,
        })
        if url.scheme == 'postgres':
            DATABASES['default'][
                'ENGINE'] = 'django.db.backends.postgresql_psycopg2'
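
To make the mapping concrete, a hedged example of how a typical `DATABASE_URL` (values invented) decomposes with `urlparse`:

import urlparse
urlparse.uses_netloc.append('postgres')

url = urlparse.urlparse('postgres://dbuser:secret@db.example.com:5432/mydb')
print url.path[1:], url.username, url.password, url.hostname, url.port
# -> mydb dbuser secret db.example.com 5432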
Пример #47
0
    def sources(self, url, hostDict, hostprDict):
        try:
            sources = []

            if url == None: return sources

            if debrid.status() == False: raise Exception()

            data = urlparse.parse_qs(url)
            data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])

            try:
                if not 'tvshowtitle' in data: raise Exception()

                links = []

                f = ['S%02dE%02d' % (int(data['season']), int(data['episode']))]
                t = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', '', data['tvshowtitle'])
                t = t.replace("&", "")
                
                q = self.search_link + urllib.quote_plus('%s %s' % (t, f[0]))
                
                q = urlparse.urljoin(self.base_link, q)
                result = client.request(q)
                result = json.loads(result)

                result = result['results']
            except:
                links = result = []
            
            for i in result:
                try:
                    if not cleantitle.get(t) == cleantitle.get(i['showName']): raise Exception()

                    y = i['release']
                    y = re.compile('[\.|\(|\[|\s](\d{4}|S\d*E\d*)[\.|\)|\]|\s]').findall(y)[-1]
                    y = y.upper()
                    if not any(x == y for x in f): raise Exception()

                    quality = i['quality']
                    quality = quality.upper()

                    size = i['size']
                    size = float(size)/1024
                    size = '%.2f GB' % size
   
                    if any(x in quality for x in ['HEVC', 'X265', 'H265']): info = '%s | HEVC' % size
                    else: info = size

                    if '1080P' in quality: quality = '1080p'
                    elif '720P' in quality: quality = 'HD'
                    else: quality = 'SD'

                    url = i['links']
                    #for x in url.keys(): links.append({'url': url[x], 'quality': quality, 'info': info})
                    
                    links = []
                    
                    for x in url.keys(): links.append({'url': url[x], 'quality': quality})
                    
                    for link in links:
                        try:
                            url = link['url']
                            quality2 = link['quality']
                            #url = url[1]
                            #url = link
                            if len(url) > 1: raise Exception()
                            url = url[0].encode('utf-8')
                            
                            host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(url.strip().lower()).netloc)[0]
                            if not host in hostprDict: raise Exception()
                            host = host.encode('utf-8')

                            sources.append({'source': host, 'quality': quality2, 'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True})
                        except:
                            pass
                    
                except:
                    pass


            return sources
        except:
            return sources
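
A hedged look at the host-normalisation step used above, with a made-up hoster URL:

import re
import urlparse

u = 'http://www.rapidgator.net/file/abc123'
host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(u.strip().lower()).netloc)[0]
print host   # -> rapidgator.net (keeps only the last two labels of the hostname)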
Пример #48
0
def download(args):
    if args.arch:
        config.arch = args.arch

    installed_build = builds.get_installed_build()

    def build_suffix(build):
        if build > installed_build:
            symbol = '+'
        elif build < installed_build:
            symbol = '-'
        else:
            symbol = '='
        return symbol

    build_sources = sources.build_sources()

    if args.source:
        source_name = args.source
        try:
            build_source = build_sources[source_name]
        except KeyError:
            parsed = urlparse(source_name)
            if parsed.scheme in ('http', 'https') and parsed.netloc:
                if args.releases:
                    build_url = builds.BuildsURL(
                        source_name, extractor=builds.ReleaseLinkExtractor)
                else:
                    build_url = builds.BuildsURL(source_name)
            else:
                print(
                    '"{}" is not in the list of available sources '
                    'and is not a valid HTTP URL').format(args.source)
                print 'Valid options are:\n\t{}'.format("\n\t".join(
                    build_sources.keys()))
                sys.exit(1)
    else:
        source_name = get_choice(build_sources.keys())
        build_source = build_sources[source_name]

    print
    print "Arch: {}".format(config.arch)
    print "Installed build: {}".format(installed_build)

    try:
        links = build_source.builds()
    except requests.RequestException as e:
        print str(e)
    except builds.BuildURLError as e:
        print str(e)
    else:
        if links:
            build = get_choice(links, build_suffix, reverse=True)
            remote = build.remote_file()
            file_path = os.path.join(libreelec.UPDATE_DIR, build.filename)
            print
            print "Downloading {0} ...".format(build.url)
            try:
                with open(file_path, 'w') as out:
                    process(remote, out, build.size)
            except KeyboardInterrupt:
                os.remove(file_path)
                print
                print "Download cancelled"
                sys.exit()

            if build.compressed:
                tar_path = os.path.join(libreelec.UPDATE_DIR, build.tar_name)
                size = os.path.getsize(file_path)
                print
                print "Decompressing {0} ...".format(file_path)
                with open(file_path, 'r') as fin, open(tar_path, 'w') as fout:
                    process(fin, fout, size, decompress)
                os.remove(file_path)

            funcs.create_notify_file(source_name, build)

            print
            print "The update is ready to be installed. Please reboot."
        else:
            print
            print "No builds available"
Пример #49
0
    def run(self):
        def download(): return []
        result = cache.bennu_download_get(download, 600000000, table='rel_dl')

        for item in result:
            self.name = item['name'] ; self.image = item['image'] ; self.url = item['url']

            sysname = self.name.translate(None, '\/:*?"<>|').strip('.')

            url = self.url.split('|')[0]
            try: headers = dict(urlparse.parse_qsl(self.url.rsplit('|', 1)[1]))
            except: headers = dict('')

            ext = os.path.splitext(urlparse.urlparse(url).path)[1][1:].lower()

            hdlr = re.compile('.+? ([(]\d{4}[)]|S\d*E\d*)$').findall(self.name)
            if len(hdlr) == 0: self.content = 'Uncategorised'

            if ext in ['m4a', 'mp3', 'aac']: self.content = 'Music'

            hdlr = re.compile('.+? (S\d*E\d*)$').findall(self.name)
            if len(hdlr) > 0: self.content = 'TVShows'

            hdlr = re.compile('.+? [(](\d{4})[)]$').findall(self.name)
            if len(hdlr) > 0: self.content = 'Movies'

            if self.content == 'Movies':
                dest = os.path.join(downloadPath, self.content)
                control.makeFile(dest)
                dest = os.path.join(dest, sysname)
                control.makeFile(dest)

            elif self.content == 'TVShows':
                d = re.compile('(.+?) S(\d*)E(\d*)$').findall(sysname)[0]
                dest = os.path.join(downloadPath, self.content)
                control.makeFile(dest)
                dest = os.path.join(dest, d[0])
                control.makeFile(dest)
                dest = os.path.join(dest, 'Season %01d' % int(d[1]))
                control.makeFile(dest)

            else:
                dest = os.path.join(downloadPath, self.content)
                control.makeFile(dest)


            if not ext in ['mp4', 'm4a', 'mp3', 'aac', 'mkv', 'flv', 'avi', 'mpg']: ext = 'mp4'

            dest = os.path.join(dest, sysname + '.' + ext)

            control.infoDialog(self.name + ' Is Downloading', 'Downloads Started', self.image, time=7000)

            try:
                req = urllib2.Request(url, headers=headers)
                resp = urllib2.urlopen(req, timeout=30)
            except Exception,e:
                removeDownload(self.url)
                print '%s ERROR - File Failed To Open' % (dest)
                continue

            try: self.size = int(resp.headers['Content-Length'])
            except: self.size = 0

            if self.size < 1:
                removeDownload(self.url)
                print '%s Unknown filesize - Unable to download' % (dest)
                continue

            try:  resumable = 'bytes' in resp.headers['Accept-Ranges'].lower()
            except: resumable = False

            size = 1024 * 1024
            if self.size < size: size = self.size

            gb = '%.2f GB' % (float(self.size) / 1073741824)

            start = time.clock()

            total = 0 ; notify = 0 ; errors = 0 ; count = 0 ; resume = 0 ; sleep = 0

            self.clear()

            control.window.setProperty(property + '.status', 'downloading')
            control.window.setProperty(property + '.name', str(self.name))
            control.window.setProperty(property + '.image', str(self.image))
            control.window.setProperty(property + '.size', str(gb))

            f = control.openFile(dest, 'wb')

            chunk  = None
            chunks = []

            while True:
                downloaded = total
                for c in chunks:
                    downloaded += len(c)

                percent = min(100 * downloaded / self.size, 100)
                
                self.speed = str(int((downloaded / 1024) / (time.clock() - start))) + ' KB/s'
                self.percent = str(percent) + '%'

                control.window.setProperty(property + '.percent', str(self.percent))
                control.window.setProperty(property + '.speed', str(self.speed))

                if percent >= notify:
                    control.infoDialog('Downloaded %s' % self.percent, self.name, self.image, time=5000)
                    notify += 10


                chunk = None
                error = False

                try:        
                    chunk  = resp.read(size)
                    if not chunk:
                        if percent < 99:  # compare the numeric percent, not the 'NN%' string
                            error = True
                        else:
                            while len(chunks) > 0:
                                c = chunks.pop(0)
                                f.write(c)
                                del c

                            f.close()
                            print '%s download complete' % (dest)
                            break

                except Exception, e:
                    print str(e)
                    error = True
                    sleep = 10
                    errno = 0

                    if hasattr(e, 'errno'):
                        errno = e.errno

                    if errno == 10035: # 'A non-blocking socket operation could not be completed immediately'
                        pass

                    if errno == 10054: #'An existing connection was forcibly closed by the remote host'
                        errors = 10 #force resume
                        sleep  = 30

                    if errno == 11001: # 'getaddrinfo failed'
                        errors = 10 #force resume
                        sleep  = 30

                if chunk:
                    errors = 0
                    chunks.append(chunk)
                    if len(chunks) > 5:
                        c = chunks.pop(0)
                        f.write(c)
                        total += len(c)
                        del c

                if error:
                    errors += 1
                    count  += 1
                    print '%d Error(s) whilst downloading %s' % (count, dest)
                    control.sleep(sleep*1000)

                if (resumable and errors > 0) or errors >= 10:
                    if (not resumable and resume >= 50) or resume >= 500:
                        #Give up!
                        print '%s download canceled - too many errors whilst downloading' % (dest)
                        break

                    resume += 1
                    errors  = 0
                    if resumable:
                        chunks  = []
                        #create new response
                        print 'Download resumed (%d) %s' % (resume, dest)
                        h = headers ; h['Range'] = 'bytes=%d-' % int(total)
                        try: resp = urllib2.urlopen(urllib2.Request(url, headers=h), timeout=10)
                        except: resp = None
                    else:
                        #use existing response
                        pass

                if control.window.getProperty(property + '.status') == 'stop':
                    control.infoDialog('Process Complete', 'Downloads', time=5000)
                    return self.clear()
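
A hedged illustration of the `url|headers` convention the downloader unpacks at the top of its loop (values are made up):

import urlparse

raw = 'http://example.com/file.mp4|Referer=http%3A%2F%2Fexample.com%2F&User-Agent=Mozilla%2F5.0'
url = raw.split('|')[0]
headers = dict(urlparse.parse_qsl(raw.rsplit('|', 1)[1]))

print url                                         # -> http://example.com/file.mp4
print headers['Referer'], headers['User-Agent']   # -> http://example.com/ Mozilla/5.0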
Пример #50
0
/usr/lib/spark/bin/spark-submit --conf spark.hadoop.yarn.resourcemanager.connect.max-wait.ms=60000 --conf spark.hadoop.fs.defaultFS=hdfs://ip-172-31-38-180.us-west-2.compute.internal:8020 --conf spark.hadoop.yarn.resourcemanager.address=ip-172-31-38-180.us-west-2.compute.internal:8032 --conf spark.dynamicAllocation.enabled=true --conf spark.shuffle.service.enabled=true --conf spark.dynamicAllocation.minExecutors=1 --conf spark.dynamicAllocation.maxExecutors=18 --conf spark.executor.memory=5g --conf spark.executor.cores=4 --name tape --master yarn --deploy-mode cluster --jars /opt/amazon/superjar/glue-assembly.jar --files /tmp/glue-default.conf,/tmp/glue-override.conf,/opt/amazon/certs/InternalAndExternalAndAWSTrustStore.jks,/opt/amazon/certs/rds-combined-ca-bundle.pem,/tmp/g-ef1db6367ac2ca9900a1ec51e0610890dd85420b-2014450836307118787/script_2018-01-16-19-19-57.py --py-files /tmp/PyGlue.zip --driver-memory 5g --executor-memory 5g /tmp/runscript.py script_2018-01-16-19-19-57.py --JOB_NAME PropertyDim_Full_Refresh --JOB_ID j_50471421e9761b8bb5ab038777ad7d47ca763b820c11175075f667160acf650b --s3_bucket_sql_file move-dataeng-lstp-dev --s3_prefix_sql_file edw/propertydim_temp_scripts/property_dim_dedupe.sql --JOB_RUN_ID jr_9b9536625270bf934b4fd16afb589ac5019e1d1792b9ff228b8f6084ad403e08 --job-bookmark-option job-bookmark-disable --temp_table_name property_dim_All --s3_source_path s3://move-dataeng-lstp-prod/edw/processed-data-xact/property_dim/year=2011/month=04/day=21/hour=07 --TempDir s3://move-dataeng-temp-dev/glue-results/mk/propertyfullrefresh/

"""
# Extract the Glue Job Arguments
args = getResolvedOptions(sys.argv, [
    'JOB_NAME', 's3_source_path', 's3_target_path', 'source_sql_file_path',
    'temp_table_name'
])
print "Job Name is: ", args['JOB_NAME']
print "S3 Source File Path: ", args['s3_source_path']
print "S3 Target File Path: ", args['s3_target_path']
print "Source SQL File Path: ", args['source_sql_file_path']
print "Temp Table Name: ", args['temp_table_name']

#Parse Bucket and Prefix of the SQL File Path
source_sql_path = urlparse(args['source_sql_file_path'])
s3_bucket_sql_file = source_sql_path.netloc
s3_prefix_sql_file = source_sql_path.path.lstrip('/')


# Method to read S3 file as a string
def getStringFromFile(bucket_name, key):
    s3_client = boto3.client('s3', region_name='us-west-2')
    response = s3_client.get_object(Bucket=bucket_name, Key=key)
    data = response['Body'].read()
    return data
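
A hedged usage sketch tying the parsed S3 location to the helper above (the real bucket and key come from the job arguments):

# e.g. --source_sql_file_path s3://my-bucket/edw/scripts/property_dim_dedupe.sql
sql_text = getStringFromFile(s3_bucket_sql_file, s3_prefix_sql_file)
print "Loaded %d bytes of SQL from s3://%s/%s" % (len(sql_text), s3_bucket_sql_file, s3_prefix_sql_file)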


#1. Variables

#s3://move-dataeng-lstp-prod/edw/processed-data-xact/property_dim
Пример #51
0
def replaceurl(url, port):
    parsed = urlparse(url)
    newurl = 'https://' + parsed.hostname + ':' + str(port)
    return (newurl)
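
Two hedged example calls (hostnames are placeholders), assuming `from urlparse import urlparse` as in the snippet; note that the path and query string are deliberately dropped:

print replaceurl('http://www.example.com/some/path?x=1', 8443)   # -> https://www.example.com:8443
print replaceurl('https://api.example.com/', 443)                # -> https://api.example.com:443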
Пример #52
0
 def GetChangeIdForReview(self, review_url):  # pragma: no cover
     u = urlparse.urlparse(review_url)
     return u.path.split('/')[-1]
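
The method simply returns the last path component; a hedged illustration with a hypothetical Gerrit review URL:

import urlparse

u = urlparse.urlparse('https://chromium-review.googlesource.com/c/chromium/src/+/123456')
print u.path.split('/')[-1]   # -> 123456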
Пример #53
0
    def req_config_handler(self):
        req = urlparse.urlparse(self.path).query
        reqs = urlparse.parse_qs(req, keep_blank_values=True)
        data = ''

        appid_updated = False

        try:
            if reqs['cmd'] == ['get_config']:
                data = json.dumps(user_config.user_special, default=lambda o: o.__dict__)
            elif reqs['cmd'] == ['set_config']:
                appids = self.postvars['appid'][0]
                if appids != user_config.user_special.appid:
                    if appids and ip_manager.good_ip_num:
                        fail_appid_list = test_appid.test_appids(appids)
                        if len(fail_appid_list):
                            fail_appid = "|".join(fail_appid_list)
                            return self.send_response_nc('text/html', '{"res":"fail", "reason":"appid fail:%s"}' % fail_appid)

                    appid_updated = True
                    user_config.user_special.appid = appids

                user_config.user_special.proxy_enable = self.postvars['proxy_enable'][0]
                user_config.user_special.proxy_type = self.postvars['proxy_type'][0]
                user_config.user_special.proxy_host = self.postvars['proxy_host'][0]
                user_config.user_special.proxy_port = self.postvars['proxy_port'][0]
                try:
                    user_config.user_special.proxy_port = int(user_config.user_special.proxy_port)
                except:
                    user_config.user_special.proxy_port = 0

                user_config.user_special.proxy_user = self.postvars['proxy_user'][0]
                user_config.user_special.proxy_passwd = self.postvars['proxy_passwd'][0]
                user_config.user_special.host_appengine_mode = self.postvars['host_appengine_mode'][0]

                use_ipv6 = int(self.postvars['use_ipv6'][0])
                if user_config.user_special.use_ipv6 != use_ipv6:
                    if use_ipv6:
                        if not check_local_network.check_ipv6():
                            xlog.warn("IPv6 was enabled, but check failed.")
                            return self.send_response_nc('text/html', '{"res":"fail", "reason":"IPv6 fail"}')

                    user_config.user_special.use_ipv6 = use_ipv6

                user_config.save()

                config.load()
                appid_manager.reset_appid()
                import connect_manager
                connect_manager.load_proxy_config()
                connect_manager.https_manager.load_config()
                if appid_updated:
                    http_dispatch.close_all_worker()

                ip_manager.reset()
                check_ip.load_proxy_config()

                data = '{"res":"success"}'
                self.send_response_nc('text/html', data)
                #http_request("http://127.0.0.1:8085/init_module?module=gae_proxy&cmd=restart")
                return
        except Exception as e:
            xlog.exception("req_config_handler except:%s", e)
            data = '{"res":"fail", "except":"%s"}' % e
        self.send_response_nc('text/html', data)
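
A hedged reminder of why `reqs['cmd']` is compared against a one-element list: `parse_qs` always maps each parameter to a list of values.

import urlparse

print urlparse.parse_qs('cmd=get_config&extra=', keep_blank_values=True)
# -> {'cmd': ['get_config'], 'extra': ['']}  (key order may vary)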
Пример #54
0
    def __init__(self,
                 username=None,
                 password=None,
                 security_token=None,
                 session_id=None,
                 instance=None,
                 instance_url=None,
                 organizationId=None,
                 sandbox=False,
                 version=DEFAULT_API_VERSION,
                 proxies=None,
                 session=None,
                 client_id=None):
        """Initialize the instance with the given parameters.

        Available kwargs

        Password Authentication:

        * username -- the Salesforce username to use for authentication
        * password -- the password for the username
        * security_token -- the security token for the username
        * sandbox -- True if you want to login to `test.salesforce.com`, False
                     if you want to login to `login.salesforce.com`.

        Direct Session and Instance Access:

        * session_id -- Access token for this session

        Then either
        * instance -- Domain of your Salesforce instance, i.e.
          `na1.salesforce.com`
        OR
        * instance_url -- Full URL of your instance i.e.
          `https://na1.salesforce.com`

        Universal Kwargs:
        * version -- the version of the Salesforce API to use, for example
                     `29.0`
        * proxies -- the optional map of scheme to proxy server
        * session -- Custom requests session, created in calling code. This
                     enables the use of requests Session features not otherwise
                     exposed by simple_salesforce.

        """

        # Determine if the user passed in the optional version and/or sandbox
        # kwargs
        super(Salesforce, self).__init__(session=session)
        self.sf_version = version
        self.sandbox = sandbox
        self.proxies = self.session.proxies
        # override custom session proxies dance
        if proxies is not None:
            if not session:
                self.session.proxies = self.proxies = proxies
            else:
                logger.warning(
                    'Proxies must be defined on custom session object, '
                    'ignoring proxies: %s', proxies)

        # Determine if the user wants to use our username/password auth or pass
        # in their own information
        if all(arg is not None
               for arg in (username, password, security_token)):
            self.auth_type = "password"

            # Pass along the username/password to our login helper
            self.session_id, self.sf_instance = SalesforceLogin(
                session=self.session,
                username=username,
                password=password,
                security_token=security_token,
                sandbox=self.sandbox,
                sf_version=self.sf_version,
                proxies=self.proxies,
                client_id=client_id)

        elif all(arg is not None
                 for arg in (session_id, instance or instance_url)):
            self.auth_type = "direct"
            self.session_id = session_id

            # If the user provides the full url (as returned by the OAuth
            # interface for example) extract the hostname (which we rely on)
            if instance_url is not None:
                self.sf_instance = urlparse(instance_url).hostname
            else:
                self.sf_instance = instance

        elif all(arg is not None
                 for arg in (username, password, organizationId)):
            self.auth_type = 'ipfilter'

            # Pass along the username/password to our login helper
            self.session_id, self.sf_instance = SalesforceLogin(
                session=self.session,
                username=username,
                password=password,
                organizationId=organizationId,
                sandbox=self.sandbox,
                sf_version=self.sf_version,
                proxies=self.proxies,
                client_id=client_id)

        else:
            raise TypeError(
                'You must provide login information or an instance and token')

        if self.sandbox:
            self.auth_site = 'https://test.salesforce.com'
        else:
            self.auth_site = 'https://login.salesforce.com'

        self.base_url = ('https://{instance}/services/data/v{version}/'.format(
            instance=self.sf_instance, version=self.sf_version))
        self.apex_url = ('https://{instance}/services/apexrest/'.format(
            instance=self.sf_instance))
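
A hedged sketch of the three authentication paths the constructor accepts (all credential values are placeholders):

# 1) Username / password / security token
sf = Salesforce(username='user@example.com', password='password',
                security_token='token')

# 2) An existing session id plus the full instance URL (e.g. from an OAuth flow)
sf = Salesforce(session_id='00Dxx0000000001!AQ4...', instance_url='https://na1.salesforce.com')

# 3) IP-filtering orgs: username / password / organizationId
sf = Salesforce(username='user@example.com', password='password',
                organizationId='00Dxxxxxxxxxxxx')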
Пример #55
0
# Patch admin site for stats application
patch(admin.site)
admin.autodiscover()

handler404 = 'shared.views.view_404'
handler500 = 'shared.views.view_500'

urlpatterns = patterns(
    '',
    (r'', include('shared.urls')),
    (r'', include('badges.urls')),
    (r'', include('banners.urls')),
    (r'^accounts/', include('users.urls')),
    (r'^browserid/', include('browserid.urls')),
    (r'^fb/', include('facebook.urls')),
    (r'^admin/', include('smuggler.urls')),
    (r'^admin/', include(admin.site.urls)),
)

## In DEBUG mode, serve media files through Django.
if settings.DEBUG or settings.SERVE_MEDIA:
    # Remove host, leading and trailing slashes so the regex matches.
    media_url = urlparse(settings.MEDIA_URL).path.lstrip('/').rstrip('/')
    urlpatterns += patterns(
        '',
        (r'^%s/(?P<path>.*)$' % media_url, 'django.views.static.serve', {
            'document_root': settings.MEDIA_ROOT
        }),
    )
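
To see what the regex above evaluates to, a hedged example with a hypothetical `MEDIA_URL`:

from urlparse import urlparse

media_url = urlparse('http://cdn.example.com/media/').path.lstrip('/').rstrip('/')
print media_url                          # -> media
print r'^%s/(?P<path>.*)$' % media_url   # -> ^media/(?P<path>.*)$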
Пример #56
0
print '%d total emojis in master.' % len(master_list)

# Check if emojis in master is missing from all_emojis
for phrase, img_url in master_list.items():
    #if phrase not in all_emojis and phrase.encode('utf-8') not in all_emojis:
    if not all_emojis.get(phrase) and not all_emojis.get(
            phrase.encode('utf-8')):
        print 'MISSING: ', phrase, img_url

for phrase, img_url in all_emojis.items():
    if args.debug:
        print phrase + '\t' + img_url
    if img_url.startswith(WEIBO_EMOJI_WEBROOT):
        filename = img_url.replace(WEIBO_EMOJI_WEBROOT, '')
    else:
        filename = urlparse.urlparse(img_url).path[1:]

    # to catch img urls like http://img.t.sinajs.cn/t4/appstyle/expression/ext/normal/c8/../e0/hongbao1_org.gif
    filename = re.sub(r'[^/]+/\.\./', '', filename)
    output_filename = os.path.join(args.output_folder, filename)

    # create folder if necessary
    try:
        if not args.simulate:
            if not os.path.exists(os.path.dirname(output_filename)):
                os.makedirs(os.path.dirname(output_filename))

            if not os.path.isfile(output_filename):
                # download if file does not exist
                request = urllib2.Request(
                    img_url, headers={'User-agent': 'Mozilla/5.0'})
Пример #57
0
if len(sys.argv) < 2 or len(sys.argv) > 4:
	print('%s [domain] [output]'%sys.argv[0])
	sys.exit(0)

all_links = []

domain = sys.argv[1]
try:
	filename = sys.argv[2]
except IndexError:
	filename = None

import urlparse 

parse = urlparse.urlparse(domain)
if parse.netloc:
	domain = parse.netloc
elif parse.path != '' and parse.netloc == '':
	domain = parse.path
else: domain = domain

def archive():
	content = requests.get('http://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&collapse=urlkey'%domain).content
	c = json.loads(content)
	for i in c:
		for b in i:
			if domain in b and b.startswith('http'):
				if b not in all_links:
					all_links.append(b)
Пример #58
0
# Generate a list of machines to use.
machines = []
for machine in options.machines.split(','):
    if machine in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']:
        for unit in [0, 1, 2, 3]:
            machines.append("sfxc-" + machine + str(unit))
            continue
        pass
    else:
        machines.append(machine)
        pass
    continue

# Select input nodes.
for station in json_input["data_sources"]:
    url = urlparse.urlparse(json_input["data_sources"][station][0])
    if url.netloc:
        if url.port:
            data_socket[station] = url.port
        else:
            data_socket[station] = 8888
            pass
    elif url.path:
        data_socket[station] = os.path.dirname(url.path)
    else:
        data_socket[station] = '/tmp/mk5read'
        pass
    continue

readers = {}
reader_slots = {}
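
A hedged illustration of the two URL shapes the loop above distinguishes (scheme, host and path are invented), as parsed by Python 2.7's urlparse:

import os
import urlparse

u = urlparse.urlparse('mk5read://sfxc-a0:2630')
print u.netloc, u.port            # -> sfxc-a0:2630 2630  (takes the netloc branch)

u = urlparse.urlparse('file:///tmp/mk5read/ef')
print os.path.dirname(u.path)     # -> /tmp/mk5read  (takes the path branch)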
Пример #59
0
    def sources(self, url, hostDict, hostprDict):
        try:
            sources = []

            if url == None: return sources

            if debrid.status() == False: raise Exception()

            data = urlparse.parse_qs(url)
            data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])

            title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']

            hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else data['year']

            query = '%s S%02dE%02d' % (data['tvshowtitle'], int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else '%s %s' % (data['title'], data['year'])
            query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)

            url = self.search_link % urllib.quote_plus(query)
            url = urlparse.urljoin(self.base_link, url)

            r = client.request(url)

            posts = client.parseDOM(r, 'item')

            hostDict = hostprDict + hostDict

            items = []

            for post in posts:
                try:
                    t = client.parseDOM(post, 'title')[0]

                    u = re.findall('<p>(http(?:s|)://.+?)</p>', post)
                    items += [(t, i) for i in u]
                except:
                    pass

            for item in items:
                try:
                    name = item[0]
                    name = client.replaceHTMLCodes(name)

                    t = re.sub('(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d*|3D)(\.|\)|\]|\s|)(.+|)', '', name)

                    if not cleantitle.get(t) == cleantitle.get(title): raise Exception()

                    y = re.findall('[\.|\(|\[|\s](\d{4}|S\d*E\d*|S\d*)[\.|\)|\]|\s]', name)[-1].upper()

                    if not y == hdlr: raise Exception()

                    fmt = re.sub('(.+)(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d*)(\.|\)|\]|\s)', '', name.upper())
                    fmt = re.split('\.|\(|\)|\[|\]|\s|\-', fmt)
                    fmt = [i.lower() for i in fmt]

                    if any(i.endswith(('subs', 'sub', 'dubbed', 'dub')) for i in fmt): raise Exception()
                    if any(i in ['extras'] for i in fmt): raise Exception()

                    if '1080p' in fmt: quality = '1080p'
                    elif '720p' in fmt: quality = 'HD'
                    else: quality = 'SD'
                    if any(i in ['dvdscr', 'r5', 'r6'] for i in fmt): quality = 'SCR'
                    elif any(i in ['camrip', 'tsrip', 'hdcam', 'hdts', 'dvdcam', 'dvdts', 'cam', 'telesync', 'ts'] for i in fmt): quality = 'CAM'

                    info = []

                    if '3d' in fmt: info.append('3D')

                    try:
                        size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) [M|G]B)', name)[-1]
                        div = 1 if size.endswith(' GB') else 1024
                        size = float(re.sub('[^0-9|/.|/,]', '', size))/div
                        size = '%.2f GB' % size
                        info.append(size)
                    except:
                        pass

                    if any(i in ['hevc', 'h265', 'x265'] for i in fmt): info.append('HEVC')

                    info = ' | '.join(info)

                    url = item[1]
                    if any(x in url for x in ['.rar', '.zip', '.iso']): raise Exception()
                    url = client.replaceHTMLCodes(url)
                    url = url.encode('utf-8')

                    host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(url.strip().lower()).netloc)[0]
                    if not host in hostDict: raise Exception()
                    host = client.replaceHTMLCodes(host)
                    host = host.encode('utf-8')

                    sources.append({'source': host, 'quality': quality, 'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True})
                except:
                    pass

            check = [i for i in sources if not i['quality'] == 'CAM']
            if check: sources = check

            return sources
        except:
            return sources
Пример #60
0
from urlparse import urlparse
import sys

for line in sys.stdin:
    data = line.strip().split("GET")
    if len(data) == 2:
        print urlparse(data[1]).path
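
A hedged walk-through of one made-up access-log line through the same logic:

line = '1.2.3.4 - - [10/Oct/2017:13:55:36] "GET /index.html?q=1 HTTP/1.0" 200'
data = line.strip().split("GET")
print urlparse(data[1]).path   # -> ' /index.html' (the leading space left by the split is kept)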