Example #1
File: crawler3.py Project: sigmundc/CS6965
def normalize_url(base_url, url):
	myfile3 = open('normalization_log', 'a')
	myfile3.write("base url:{0}\n".format(base_url))
	myfile3.write("url:{0}\n".format(url))
	myfile3.close()
	result = ''

	# if url starts with http:// or https://
	allowed_scheme = ['http', 'https']
	url_scheme = urlparse(url).scheme
	if url_scheme in allowed_scheme:
		return urlnorm.norm(url)
	elif url_scheme == 'mailto':
		return False
	elif len(url_scheme) == 0:
		# check if URL starts with ../ or ./
		if (url[:3] == '../') or (url[:2] == './'):
			return urlnorm.norm(base_url+'/'+url)
		elif url[0] == '/': # e.g. /page/page
			# That means it's the domain + url
			url_obj = urlparse(base_url)
			new_url = url_obj.scheme + "://" + url_obj.netloc + url
			return urlnorm.norm(new_url)

		else: # URL should be just html page e.g. research.html
			# so we need to replace the last part
			# if URL is 'http://www.test.com/page/page/12345':
			# results will be ['http://www.test.com/page/page', '12345']
			parts = base_url.rsplit('/', 1)
			return urlnorm.norm(parts[0]+'/'+url)
	result = url
	return result
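A minimal usage sketch for the normalize_url above, assuming urlnorm is installed and urlparse/urlnorm are imported in the same module; the sample URLs and the outputs in the comments are illustrative, not taken from the project:

base = 'http://www.test.com/page/page/12345'

# Absolute URL: handed straight to urlnorm.norm (scheme and host are lowercased).
print(normalize_url(base, 'HTTP://WWW.Test.com/About'))        # e.g. http://www.test.com/About
# Root-relative URL: scheme and netloc are taken from the base URL.
print(normalize_url(base, '/research/index.html'))             # e.g. http://www.test.com/research/index.html
# Bare page name: replaces the last path segment of the base URL.
print(normalize_url(base, 'research.html'))                    # e.g. http://www.test.com/page/page/research.html
# mailto: links are rejected.
print(normalize_url(base, 'mailto:someone@example.com'))       # False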
Example #2
def main():
	if (len(sys.argv) < 3 ):
		print "usage: python ll-print.py <url> <search term>"
		print "example: python ll-print.py http://www.hunch.com 'hunch team'"
		exit(0)
	root_URL = sys.argv[1]
	search_term = sys.argv[2]
	if (not validate_search_term(search_term)):
		print "Invalid search term.  Please only use valid url characters and spaces."
		exit(1)
	first_letter = search_term[0]
	first_letter_match = root_URL.find(first_letter.lower())
	if (first_letter_match != -1):
		try:
			br = mechanize.Browser()
			br._factory.is_html = True
			result = []
			br.open(root_URL)
			# print "visiting: " + urlnorm.norm(br.geturl())
			visited = set([urlnorm.norm(br.geturl()), urlnorm.norm(root_URL)])
			result = find_matching_links(br, search_term, result, visited)
			if (result):
				max_index = max(result, key=lambda u: u[1])[1]
				for l, i, c in result:
					print_url(l, i, max_index)
		except urlnorm.InvalidUrl:
			print "Invalid root URL"
		except urllib2.URLError, e:
			print "Error opening root URL"
			print e
		except Exception, e:
			print e
Example #3
def crawl(db, url, urls_crawled={}):
    # Make sure we don't infinite loop; keep track of which URLs we have crawled
    # TODO pull this from Phoenix

    # Crawl this website, get all of the outbound URLs
    urls_to_crawl = crawl_one(db, url)
    # Record that we crawled this url
    urls_crawled[url] = None

    for url_to_crawl in urls_to_crawl:
        try:
            url_to_crawl = urlnorm.norm(url_to_crawl)
        except urlnorm.InvalidUrl:
            # Try to convert it to an absolute url
            url_to_crawl = urlnorm.norm("%s%s" % (url, url_to_crawl))

        # Don't re-record
        if url_to_crawl in urls_crawled:
            print 'Skipping %s as already crawled' % (url_to_crawl)
        # Only crawl my site
        elif url_to_crawl.startswith('https://penguinsinabox.com'):
            crawl(db, url_to_crawl, urls_crawled)
        else:
            # A website not owned by me
            print 'Skipping %s as not a self-controlled site' % (url_to_crawl)
    print "Finished processing children of %s" % (url)
Example #4
def find_matching_links(br, target_word, result, visited):
	if (not target_word):
		return result
	else:
		current_URL = urlnorm.norm(br.geturl())
		current_letter = target_word[0].lower()
		if (current_letter.isspace()):
			return find_matching_links(br, target_word[1:], result + [('', -1, ' ')], visited)
		else:
			matching_index = current_URL[7:].find(current_letter)
			if (matching_index == -1):
				return []
			else:
				new_result = result + [(current_URL, matching_index + 7, current_letter)]
				links = list(br.links())
				for link in links:
					try:
						link_URL = urlnorm.norm(link.absolute_url)
						if (link_URL not in visited):
							br.open(link_URL)
							new_visited = visited.copy()
							new_visited.add(link_URL)
							# print "visiting: " + urlnorm.norm(br.geturl())
							new_visited.add(urlnorm.norm(br.geturl()))
							child_result = find_matching_links(br, target_word[1:], new_result, new_visited)
							if (child_result):
								return child_result
					except Exception, e:
						continue
Example #5
def fetchOutlinks(ahrefs):
    newOutLinks = set()
    base_url = "https://en.wikipedia.org"
    for a in ahrefs:
        try:
            ahref = a['href'].lower()
            not_parseable_ressources = (".avi", ".mkv", ".mp4", ".jpg",
                                        ".jpeg", ".png", ".gif", ".pdf",
                                        ".iso", ".rar", ".tar", ".tgz", ".zip",
                                        ".dmg", ".exe")
            if not urlparse.urlparse(ahref).path.endswith(
                    not_parseable_ressources):
                if "wiki" in ahref:
                    if "#" in ahref:  # Finding and removing URLs with # in them
                        ahref = ahref[:ahref.find("#")]
                        pass
                    elif "?" in ahref:  # Finding and removing URLs with ? in them
                        ahref = ahref[:ahref.find("?")]
                        pass
                    elif ":" in ahref:  # Finding and removing URLs with : in them
                        ahref = ahref[:ahref.find(":")]
                        pass
                    elif "//" in ahref:  # Finding and removing URLs with // in them
                        ahref = ahref[:ahref.find("//")]
                        pass
                    elif ahref == "/wiki/Main_Page":  # Finding and removing URLs of Main page of Wiki
                        pass
                    elif "united" in ahref or "states" in ahref or "u.s" in ahref or "illegal" in ahref or "immig" in ahref or "donald" in ahref or "trump" in ahref:
                        newUrl = a['href']
                        finalUrl = base_url + newUrl
                        finalUrl = urlnorm.norm(finalUrl).encode(
                            "utf-8", "ignore")
                        newOutLinks.add(finalUrl)
                    else:
                        if ahref[:2] == "//":  # Finding and removing URLs with // in them
                            pass
                        elif "index" in ahref or "youtube" in ahref or "rgu" in ahref or "book" in ahref or "american" in ahref:
                            pass
                        elif "#" in ahref:  # Finding and removing URLs with # in them
                            ahref = ahref[:ahref.find("#")]
                            pass
                        elif "united" in ahref or "states" in ahref or "u.s" in ahref or "illegal" in ahref or "immig" in ahref or "donald" in ahref or "trump" in ahref:
                            newUrl = a['href']
                            finalUrl = newUrl
                            print "outlinks:", finalUrl
                            finalUrl = urlnorm.norm(finalUrl).encode(
                                "utf-8", "ignore")
                            newOutLinks.add(finalUrl)
        except KeyError, e:
            pass
Example #6
    def __init__(self, url, previous=None, **info):
        # Apply the simple idempotent optimizations to all urls (no need to
        # ever deal with "HTTP://.."). This means case-sensitivity, and a
        # whole lot of other things that the urlnorm library will do for us.
        # We call this the original url, even though it is a bit of a lie.
        try:
            self.original_url = urlnorm.norm(url)
        except urlnorm.InvalidUrl as e:
            raise urlnorm.InvalidUrl('{}: {}'.format(e, url))

        # For the normalized url that we'll be exposing, remove the
        # fragment, and treat https and http the same.
        url, fragment = urldefrag(self.original_url)
        self.lossy_url_data = {'fragment': fragment}
        if url.startswith('https:'):
            url = 'http' + url[5:]
            self.lossy_url_data.update({'protocol': 'https'})
        self.url = url

        self.set_previous(previous)
        self.info = info
        self.post = None

        # Runtime data
        self.response = None
        self.exception = None
        self.retries = 0
Example #7
File: utils.py Project: jpopelka/gluetool
def treat_url(url, logger=None):
    """
    Remove "weird" artifacts from the given URL. Collapse adjacent '.'s, apply '..', etc.

    :param str url: URL to clear.
    :param gluetool.log.ContextAdapter logger: logger to use for logging.
    :rtype: str
    :returns: Treated URL.
    """

    logger = logger or Logging.get_logger()

    logger.debug("treating a URL '{}'".format(url))

    try:
        url = str(urlnorm.norm(url))

    except urlnorm.InvalidUrl as exc:
        # urlnorm cannot handle localhost: https://github.com/jehiah/urlnorm/issues/3
        if exc.message == "host u'localhost' is not valid":
            pass

        else:
            raise exc

    return url.strip()
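A hypothetical call to treat_url, only to illustrate the dot-segment clean-up the docstring describes; it assumes no logger is passed, so the Logging.get_logger() fallback is used:

print(treat_url('HTTP://example.com/a/./b/../c'))   # typically returns 'http://example.com/a/c'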
Example #8
    def _canonize(self):
        if self.is_absolute:
            self.canonical_scheme, self.canonical_netloc, self.canonical_path, self.canonical_query, self.canonical_fragment \
                = urlsplit(urlnorm.norm(self.raw))
        else:
            self.canonical_scheme, self.canonical_netloc, self.canonical_path, self.canonical_query, self.canonical_fragment \
                = self.raw_scheme, self.raw_netloc, self.raw_path, self.raw_query, self.raw_fragment

        self.canonical_scheme = self.canonical_scheme.lower()

        if self.raw_scheme.endswith('s'):
            self.canonical_netloc, _ = rstrip_string(self.canonical_netloc,
                                                     ':443')
        else:
            self.canonical_netloc, _ = rstrip_string(self.canonical_netloc,
                                                     ':80')

        self.canonical_netloc = self.canonical_netloc.lower()

        self.canonical_path = '' if self.canonical_path == '/' else self.canonical_path

        params = parse_qsl(self.canonical_query, True)
        self.canonical_query_params = [(k, v) for (k, v) in sorted(params)]

        self.canonical = urlunsplit(
            (self.canonical_scheme, self.canonical_netloc, self.canonical_path,
             self.canonical_query, ''))
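rstrip_string is used above but is not part of the snippet; a minimal sketch of a helper matching the call site (the name comes from the snippet, the body and the returned flag are assumptions inferred from how the result is unpacked):

def rstrip_string(value, suffix):
    # Strip an exact suffix such as ':443' once and report whether it was present.
    if value.endswith(suffix):
        return value[:-len(suffix)], True
    return value, False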
Example #9
def processPage():
    while not urls.counter > urlcount:
        try:
            link = urlpool.get()
            newurl = urlparse.urljoin(
                link.base_url,
                link.url)  # Converting relative URLs to Absolute ones
            newurl = unicode(urlnorm.norm(newurl))  # Normalizing URL
            print "out: " + newurl
            disassembled = urlparse.urlsplit(newurl)
            filename, file_ext = splitext(
                basename(disassembled.path
                         ))  # Finding file extension for filtering exclusions
            file_ext = file_ext.lower()
            if filename == 'index':
                newurl = newurl[:-len(filename + file_ext)]
            if (file_ext not in excludedExtensions
                    and disassembled.scheme in ['http', 'https']
                    and disassembled.fragment == ''):
                print "in : " + newurl
                if newurl not in visited:  # Checking to see if URL has already been queued once
                    visited.add(newurl)
                    if urlContains(newurl, searchTags) > 0:
                        urls.put(newurl, 1)
                    else:
                        priority = priorityCalculator.searchPage(
                            newurl, searchTags)
                        if priority < len(searchTags) + 1:
                            urls.put(
                                newurl, priority
                            )  # Adding URL to queue with calculated priority
        except UnicodeEncodeError:
            print "UnicodeEncodeError"
        except:
            print "Invalid URL"
Example #10
def processPage():
    while not urls.counter > urlcount:
        try:
            link = urlpool.get()
            newurl = urlparse.urljoin(link.base_url, link.url) # Converting relative URLs to Absolute ones
            newurl = unicode(urlnorm.norm(newurl)) # Normalizing URL
            print "out: " + newurl
            disassembled = urlparse.urlsplit(newurl)
            filename, file_ext = splitext(basename(disassembled.path)) # Finding file extension for filtering exclusions
            file_ext = file_ext.lower()
            if filename == 'index':
                newurl = newurl[:-len(filename + file_ext)]
            if (file_ext not in excludedExtensions and disassembled.scheme in ['http', 'https'] and disassembled.fragment == ''):
                print "in : " + newurl
                if newurl not in visited: # Checking to see if URL has already been queued once
                    visited.add(newurl)
                    if urlContains(newurl, searchTags) > 0:
                        urls.put(newurl, 1)
                    else:
                        priority = priorityCalculator.searchPage(newurl, searchTags)
                        if priority < len(searchTags) + 1:
                            urls.put(newurl, priority) # Adding URL to queue with calculated priority
        except UnicodeEncodeError:
            print "UnicodeEncodeError"
        except:
            print "Invalid URL"
Example #11
def test_invalid_urls(url):
    try:
        output = urlnorm.norm(url)
        print '%r' % output
    except urlnorm.InvalidUrl:
        return
    assert 1 == 0, "this should have raised an InvalidUrl exception"
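The url argument suggests this test is parametrized; a hedged sketch of how such a parametrization could be wired up with pytest (the sample URL list is an assumption, not the project's fixture):

import pytest
import urlnorm

# Hypothetical inputs; the real suite supplies URLs that urlnorm rejects.
INVALID_URLS = [
    'http://foo bar.com/',   # whitespace in the host
]

@pytest.mark.parametrize('url', INVALID_URLS)
def test_invalid_urls_parametrized(url):
    with pytest.raises(urlnorm.InvalidUrl):
        urlnorm.norm(url)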
Example #12
def canonicalize(url):
    """Canonicalize a URL in just a few easy steps:

        1. Resolve any redirects
        2. Normalize the URL
        3. Strip any superfluous query params
        4. Sort any remaining query params
        5. Profit!

    This relies on the urlnorm module for normalization, and, at the moment,
    just removes utm_* query params.

    TODO: Special case normalization for major sites (e.g. youtube)?
    """
    url = urlnorm.norm(resolve(url))
    url_parts = urlparse.urlsplit(url)
    scheme, netloc, path, query, fragment = url_parts

    params = []
    for key, value in cgi.parse_qs(query).iteritems():
        if exclude_param(url_parts, key, value):
            continue
        if isinstance(value, list):
            params.extend((key, v) for v in value)
        else:
            params.append((key, value))

    query = urllib.urlencode(sorted(params), doseq=1)
    return urlparse.urlunsplit((scheme, netloc, path, query, ''))
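resolve and exclude_param are project helpers that are not shown here; given the docstring's note that only utm_* parameters are removed at the moment, a minimal sketch of exclude_param consistent with that behaviour might be (the signature is taken from the call site, the body is an assumption):

def exclude_param(url_parts, key, value):
    # Drop Google Analytics campaign parameters (utm_source, utm_medium, ...).
    # url_parts is the SplitResult of the URL, available for per-site rules.
    return key.startswith('utm_')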
Example #13
def test_invalid_urls(url):
    try:
        output = urlnorm.norm(url)
        print '%r' % output
    except urlnorm.InvalidUrl:
        return
    assert 1 == 0, "this should have raised an InvalidUrl exception"
Example #14
def task_listener_crawler(gearman_worker, gearman_job):
	url = gearman_job.data
	url_frontier.add(url)
	urls = urlparse.urlparse(url)
	print "Crawling ", url
	response = requests.get(url, crawler_headers)
	print 'Downloaded page'
	if response.status_code == 200:
		raw_data = response.text
		if response.encoding != 'utf8':
			raw_data = response.text.encode(response.encoding).decode('utf8')
		r.table(raw_result_table).insert({'url': url, 'raw': raw_data, 'status': 200}, conflict="replace").run(rethink)

		links = linkregex.findall(raw_data)
		for link in (links.pop(0) for _ in xrange(len(links))):
			pre_norm_url = url_pre_norm(link, urls)
			norm_url = urlnorm.norm(pre_norm_url)
			norm_parts = urlparse.urlparse(norm_url)
			ext_url = norm_parts.path.split(".")[-1].lower()
			if ext_url not in except_url_suffixes and url_frontier.add(norm_url):
				print "Add ", norm_url, " to redis queue"
				redis_client.rpush("urls:enqueued", norm_url)
		print "Done"
		return "ok"
	else:
		r.table(raw_result_table).insert({'url': url, 'status': response.status_code}, conflict="replace").run(rethink)
	return "fail"
Example #15
def canonicalize(url):
    """Canonicalize a URL in just a few easy steps:

        1. Resolve any redirects
        2. Normalize the URL
        3. Strip any superfluous query params
        4. Sort any remaining query params
        5. Profit!

    This relies on the urlnorm module for normalization, and, at the moment,
    just removes utm_* query params.

    TODO: Special case normalization for major sites (e.g. youtube)?
    """
    url = urlnorm.norm(resolve(url))
    url_parts = urlparse.urlsplit(url)
    scheme, netloc, path, query, fragment = url_parts

    params = []
    for key, value in cgi.parse_qs(query).iteritems():
        if exclude_param(url_parts, key, value):
            continue
        if isinstance(value, list):
            params.extend((key, v) for v in value)
        else:
            params.append((key, value))

    query = urllib.urlencode(sorted(params), doseq=1)
    return urlparse.urlunsplit((scheme, netloc, path, query, ''))
Example #16
File: urls.py Project: adamchainz/aleph
def normalize_url(url):
    # TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
    try:
        norm = urlnorm.norm(url)
        norm, _ = urldefrag(norm)
        return norm.rstrip('/')
    except:
        return None
Example #17
def normalize_url(url):
    norm_url = urlnorm.norm(url)
    if norm_url.startswith("https://"):
        return norm_url[8:]
    elif norm_url.startswith("http://"):
        return norm_url[7:]
    else:
        return norm_url
Example #18
File: url.py Project: carriercomm/yeti
 def clean(self):
     """Ensures that URLs are canonized before saving"""
     self.value = refang(self.value.strip())
     try:
         if re.match(r"[^:]+://", self.value) is None:  # if no schema is specified, assume http://
             self.value = u"http://{}".format(self.value)
         self.value = urlnorm.norm(self.value)
     except urlnorm.InvalidUrl:
         raise ObservableValidationError("Invalid URL: {}".format(self.value))
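refang comes from elsewhere in the project and is not shown; as a rough, assumed sketch, a refang helper typically reverses common defanging conventions before validation:

def refang(url):
    # Undo typical defanging such as hxxp:// and bracketed dots.
    return (url.replace('hxxps://', 'https://')
               .replace('hxxp://', 'http://')
               .replace('[.]', '.')
               .replace('(.)', '.'))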
Example #19
 def fetch(self, method, endpoint, params):
     api_endpoint = norm(self.api_base + endpoint)
     content = self.oauth.request(
         method,
         api_endpoint,
         params=params,
         headers={'User-Agent': 'Semantics3 Python Lib/0.2'})
     print(content)
     return content
Example #20
File: url.py Project: rlugojr/krauler
def normalize_url(url):
    # TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
    try:
        url = urlnorm.norm(url)
        url, _ = urldefrag(url)
        url = url.rstrip("/")
        return url
    except:
        return None
Example #21
 def new(cls, *args, **kwargs):
     obj = cls(*args)
     obj.source = kwargs['source']
     obj.duplicates = 0
     obj.priority = 0
     # normalize url
     if hasattr(obj, 'url'):
         obj.url = urlnorm.norm(obj.url)
     return obj
Example #22
def googleSearch ( searchString ):
    g = pygoogle(searchString)
    g.pages = 2
    urls = g.get_urls()
    urls = urls[:10]
    for i in range(len(urls)):
        urls[i]=unicode(urlnorm.norm(urls[i]))

    return urls
Example #23
 def new(cls, *args, **kwargs):
     obj = cls(*args)
     obj.source = kwargs['source']
     obj.duplicates = 0
     obj.priority = 0
     # normalize url
     if hasattr(obj, 'url'):
         obj.url = urlnorm.norm(obj.url)
     return obj
Example #24
def createMetaResources(md5v, dataset):
    with Timer(key='createMetaResources'):
        res = getDistributionAccessURLs(dataset) + getDistributionDownloadURLs(
            dataset)
        bulk_mr = []
        uris = []
        for uri in res:
            valid = True
            try:
                uri = urlnorm.norm(uri.strip())
            except Exception as e:
                log.debug("URIFormat", uri=uri, md5=md5v, msg=e.message)
                uri = uri
                valid = False

            f = getDistributionFormatWithURL(dataset, uri)
            m = getDistributionMediaTypeWithURL(dataset, uri)
            s = getDistributionSizeWithURL(dataset, uri)
            c = getDistributionCreationDateWithURL(dataset, uri)
            mod = getDistributionModificationDateWithURL(dataset, uri)
            try:
                s_uri = safe_url_string(uri, 'utf-8')
                uri = escape_ajax(s_uri)
            except Exception as exc:
                ErrorHandler.handleError(log,
                                         "safe_url_string",
                                         exception=exc,
                                         md5=md5v,
                                         uri=uri,
                                         exc_info=True)
                uri = uri

            if uri in uris:
                log.debug("WARNING, duplicate URI",
                          dataset=dataset.id,
                          md5=md5v,
                          uri=uri,
                          format=f,
                          media=m)
                continue
            try:
                s = int(float(s)) if s is not None else None
            except Exception as e:
                s = None

            MR = MetaResource(uri=uri,
                              md5=md5v,
                              media=m,
                              valid=valid,
                              format=normaliseFormat(f),
                              size=s,
                              created=toDatetime(c),
                              modified=toDatetime(mod))
            bulk_mr.append(MR)
            uris.append(uri)
        return bulk_mr
Example #25
 def fetch(self, method, endpoint, params):
     api_endpoint = norm(self.api_base + endpoint)
     content = self.oauth.request(
                 method,
                 api_endpoint,
                 params = params,
                 headers={'User-Agent':'Semantics3 Python Lib/0.2'}
               )
     print(content)
     return content
Example #26
def dl_html(page):
    url = "http://en.wiktionary.org/wiki/%s" % page
    url = urlnorm.norm(url)

    # we should be able to crawl any page from the links we obtained
    # and we're obeying crawling delays here
    response = urllib2.urlopen(url.encode("utf8"), timeout=5)

    time.sleep(config.page_crawl_delay)
    return response.read()
Example #27
def dl_html(page):
	url = "http://en.wiktionary.org/wiki/%s" % page
	url = urlnorm.norm(url)

	# we should be able to crawl any page from the links we obtained
	# and we're obeying crawling delays here
	response = urllib2.urlopen(url.encode("utf8"), timeout=5)

	time.sleep(config.page_crawl_delay)
	return response.read()
Example #28
def canonizeurl(url):
    split = urlsplit(urlnorm.norm(url))
    path = split[2].split(" ")[0]
    while path.startswith("/.."):
        path = path[3:]
    while path.endswith("%20"):
        path = path[:-3]
    # qs = urlencode(sorted(parse_qsl(split.query)))
    qs = ""
    return urlunsplit((split.scheme, split.netloc, path, qs, ""))
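An illustrative call to canonizeurl; because qs is hard-coded to an empty string, query parameters and fragments do not survive (the output comment shows what urlnorm-based canonicalization typically yields):

print(canonizeurl('HTTP://Example.com/a/../b/?utm_source=feed#top'))
# typically 'http://example.com/b/': host lowercased, dot segments resolved,
# query string and fragment dropped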
Example #29
	def __init__(self, url):
		"""Construct from a string or Django request."""
		nurl = urlnorm.norm(url.encode('utf-8').lower())
		if hasattr(nurl, 'get_full_path'):
			nurl = nurl.get_full_path()

		self.scheme, self.netloc, self.path, self.params, \
			self.query, self.fragment = urlparse.urlparse(nurl)
		filename, self.ftype = os.path.splitext(self.path)
		self.args = dict(cgi.parse_qsl(self.query))
Example #30
def normalize_url(url, path=None):
    try:
        if path:
            url = urljoin(url, path)
        url = urlnorm.norm(url)
        # force HTTP protocol
        if url.startswith('http'):
            return url
    except urlnorm.InvalidUrl:
        pass
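A small usage sketch for this normalize_url variant, assuming urljoin is imported from urlparse (Python 2); the outputs in the comments are indicative rather than exact:

# Relative path resolved against the page URL, then normalized.
print(normalize_url('http://example.com/docs/index.html', '../about.html'))   # http://example.com/about.html
# Non-HTTP schemes fall through, so the function returns None.
print(normalize_url('ftp://example.com/file.txt'))                            # None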
Example #31
def dl_xml(params):
    url = "http://en.wiktionary.org/w/api.php?format=xml"
    for key, val in params.iteritems():
        url += "&%s=%s" % (key, val)
    url = urlnorm.norm(url)

    # We're permitted to crawl any page with the API regardless
    # of robots.txt since we're using the API
    response = urllib2.urlopen(url.encode("utf8"), timeout=5)

    time.sleep(config.api_crawl_delay)
    return response.read()
Example #32
def dl_xml(params):
	url = "http://en.wiktionary.org/w/api.php?format=xml"
	for key, val in params.iteritems():
		url += "&%s=%s" % (key, val)
	url = urlnorm.norm(url)

	# We're permitted to crawl any page with the API regardless
	# of robots.txt since we're using the API
	response = urllib2.urlopen(url.encode("utf8"), timeout=5)

	time.sleep(config.api_crawl_delay)
	return response.read()
Example #33
 def getImage(self,opener,url,data,wait_time):
     """
     Directly get an Image using URLLib. Errors Must be handled.
     
     *Required Parameters*
     
     :param opener: urllib opener to use (use GetPage for setup)
     :param url: url address to use
     :param data: data to use in request (like that passed to urlencode)
     :param wait_time: time to wait for request
     
     """
     return opener.open(urlnorm.norm(url),data,wait_time).read()
Example #34
 def clean(self):
     """Ensures that URLs are canonized before saving"""
     self.value = refang(self.value.strip())
     try:
         if re.match(r"[^:]+://", self.value) is None:
             # if no schema is specified, assume http://
             self.value = u"http://{}".format(self.value)
         self.value = urlnorm.norm(self.value)
         self.parse()
     except urlnorm.InvalidUrl:
         raise ObservableValidationError("Invalid URL: {}".format(self.value))
     except UnicodeDecodeError:
         raise ObservableValidationError("Invalid URL (UTF-8 decode error): {}".format(self.value))
Example #35
 def getFileName(self, url, folder=None):
     url_norm = urlnorm.norm(url.strip())
     url_fname = urllib.quote_plus(url_norm)
     if folder:
         submit_path = os.path.join(self.submit_folder[folder], url_fname)
         if os.path.exists(submit_path):
             return os.readlink(submit_path)
     else:
         for f in self.submit_folder:
             submit_path = os.path.join(self.submit_folder[f], url_fname)
             if os.path.exists(submit_path):
                 return os.readlink(submit_path)
     return None
Example #36
def extract_links(body):
  links = []
  for link in HTML_TAG_PATTERN.findall(body):
    try:
      link = link[2]
      netloc = urlparse.urlparse(link).netloc
      if (netloc in domains_of_interest):
        link = urlnorm.norm(link)
        links.append(link)
    except:
      pass

  return links
Example #37
File: __init__.py Project: svven/summary
 def _clean_url(self, url):
     """
     Canonicalizes the url, as it is done in Scrapy.
     And keeps only USEFUL_QUERY_KEYS. It also strips the 
     trailing slash to help identify dupes.
     """
     # TODO: Turn this into regex
     if not url.startswith('http') or url.endswith('}}') or 'nojs_router' in url:
         return None
     if site(norm(url).lower()) in config.NONCANONIC_SITES:
         clean_url = canonicalize_url(url, keep_params=True)
     else:
         clean_url = canonicalize_url(url)
     return clean_url
Example #38
def normalize_canonical_url(url, use_url_norm=True):
    try:
        if use_url_norm:
            url = urlnorm.norm(url)

        scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
        host = urlparse.urlunparse((scheme, netloc, '', '', '', ''))
        path = urlparse.urlunparse(('', '', path, params, query, fragment))
        path = shebang_regex.sub('/', path)
        url = host + path

        return url.rstrip('/')
    except Exception:
        return None
Example #39
    def normalize(self, url):
        # Make sure a scheme is present before parsing, so the check below sees it.
        if '//' not in url:
            url = '%s%s' % ('http://', url)

        parsed = urlparse(url.encode('utf-8'))

        if parsed.scheme in ("http", "https"):
            try:
                normalized_url = urlnorm.norm(url)
            except:
                return None
            return normalized_url
        else:
            return None
Example #40
def normalize_canonical_url(url, use_url_norm=True):
    try:
        if use_url_norm:
            url = urlnorm.norm(url)

        scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
        host = urlparse.urlunparse((scheme, netloc, '', '', '', ''))
        path = urlparse.urlunparse(('', '', path, params, query, fragment))
        path = shebang_regex.sub('/', path)
        url = host + path
        
        return url.rstrip('/')
    except Exception:
        return None
Example #41
File: url.py Project: yuankeyang/yeti
    def normalize(self):
        self.value = refang(self.value)

        try:
            if re.match(r"[^:]+://", self.value) is None:
                # if no schema is specified, assume http://
                self.value = u"http://{}".format(self.value)
            self.value = urlnorm.norm(self.value).replace(' ', '%20')
            self.parse()
        except urlnorm.InvalidUrl:
            raise ObservableValidationError("Invalid URL: {}".format(
                self.value))
        except UnicodeDecodeError:
            raise ObservableValidationError(
                "Invalid URL (UTF-8 decode error): {}".format(self.value))
Example #42
 def getImageSpynner(self,baseurl,spynner,iser,wait_time,proxy):
     """
     Directly get an Image with Spynner.
     
     *Required Parameters*
     
     :param baseurl: base url to use  with link (a blank string is nothing)
     :param spynner: spynner instance
     :param iser: selector for image
     :param wait_time: time to wait in acquiring an image
     :param proxy: String proxy
     """
     br=spynner
     print "Downloading..."+str(iser["src"])
     return br.download(urlnorm.norm(baseurl+iser["src"]),outfd=None,timeout=wait_time,proxy_url=proxy)
Example #43
File: url.py Project: raymundl/yeti
    def normalize(self):
        self.value = refang(self.value)

        try:
            if re.match(r"[^:]+://", self.value) is None:
                # if no schema is specified, assume http://
                self.value = u"http://{}".format(self.value)
            self.value = urlnorm.norm(self.value).replace(' ', '%20')
            self.parse()
        except urlnorm.InvalidUrl:
            raise ObservableValidationError(
                "Invalid URL: {}".format(self.value))
        except UnicodeDecodeError:
            raise ObservableValidationError(
                "Invalid URL (UTF-8 decode error): {}".format(self.value))
Example #44
File: parse.py Project: wilbrodn/aleph
def parse_url(text):
    """Clean and verify a URL."""
    # TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
    url = stringify(text)
    if url is not None:
        if url.startswith('//'):
            url = 'http:' + url
        elif '://' not in url:
            url = 'http://' + url
        try:
            norm = urlnorm.norm(url)
            norm, _ = urldefrag(norm)
            return norm
        except:
            return None
    return None
Example #45
def norm_url(url):
    url = uni(url).encode('utf-8')
    try:
        return urlnorm.norm(url)
    except urlnorm.InvalidUrl:
        # Happens when the URL is relative. Call path normalization directly.
        try:
            return urlnorm.norm_path('', url)
        except UnicodeDecodeError:
            return url

    except UnicodeDecodeError:
        # work around for bug in urlnorm on unicode url
        return url
    except:
        traceback.print_exc()
    return None
Example #46
def norm_url(url):
    url = uni(url).encode('utf-8')
    try:
        return urlnorm.norm(url)
    except urlnorm.InvalidUrl:
        # Happens when the URL is relative. Call path normalization directly.
        try:
            return urlnorm.norm_path('', url)
        except UnicodeDecodeError:
            return url

    except UnicodeDecodeError:
        # work around for bug in urlnorm on unicode url
        return url
    except:
        traceback.print_exc()
    return None
Example #47
def cert_chain_url_valid(cert_url):
    """
    Ensure that the provided URL for the certificate chain is valid, by checking that:
    * it's HTTPS
    * the host is s3.amazonaws.com
    * the port, if specified, is 443
    * the path starts with '/echo.api/'
    """
    normalized = urlnorm.norm(cert_url)
    parsed = urlparse.urlparse(normalized)
    url_checks = {
        'scheme': parsed.scheme == 'https',
        'hostname': parsed.hostname == 's3.amazonaws.com',
        'port': parsed.port in (443, None),
        'path': parsed.path.startswith('/echo.api/'),
    }
    all_checks_pass = all(url_checks.values())
    return all_checks_pass
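An illustrative check against the rules in the docstring; the URLs are made-up examples of the certificate-chain format, not captured values:

# Passes every check: https, s3.amazonaws.com host, default port, /echo.api/ path.
print(cert_chain_url_valid('https://s3.amazonaws.com/echo.api/echo-api-cert.pem'))   # True
# Fails the path check.
print(cert_chain_url_valid('https://s3.amazonaws.com/other/echo-api-cert.pem'))      # False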
Example #48
    def _prepareURL(self, apiQueryURI):
        """
        If the URI (actually just a partial URL, usually the path part) doesn't begin with
        the base URL for the API, concatenate the two into a new URL and return it.

        :param apiQueryURI: URI (actually, just a partial URL, usually the path part) for an API entry point.
        :type apiQueryURI: str
        :return: URL for the API query, ready for use
        :rtype: str
        """
        assert isinstance(apiQueryURI, str)
        assert not util.stringContainsAllCharacters(apiQueryURI, '{}'), \
            'apiQueryURI contains unformatted arguments: "%s"' % apiQueryURI

        if apiQueryURI.startswith(self.apiBaseURL):
            return apiQueryURI

        return urlnorm.norm(self.apiBaseURL + '/' + apiQueryURI)
Example #49
def extract_urls(text, regex):
    results = dict()

    for i in regex.finditer(text):
        try:
            url = urlnorm.norm(i.group(1).strip())
            url_parsed = url_parser(url)
            if results.get(url_parsed.host):
                results[url_parsed.host].add(url)
            else:
                results[url_parsed.host] = set([url])
            log.debug("Parsed domain: {}".format(url_parsed.host))
        except urlnorm.InvalidUrl:
            log.warning("Parsing invalid url: {}".format(i.group(1)))
        except:
            log.exception("Failed parsing url: {}".format(url))

    return results
Example #50
    def on_data(self, data):
        tweet_data = json.loads(data)
        if 'limit' in tweet_data:
            print("Limit:" + str(tweet_data["limit"]))
        else:
            #insert into tweet db
            tweet = tweet_data["text"]
            username = tweet_data["user"]["screen_name"]
            #lat = tweet_data[]
            #long = tweet_data[]

            c.execute("INSERT INTO tweet (time, username, tweet) VALUES (%s,%s,%s)",
                (time.time(), username, tweet))
            tweet_id = c.lastrowid

            # insert full urls into DB
            for url in tweet_data["entities"]["urls"]:

                # process URL
                norm_url = urlnorm.norm(url["expanded_url"])
                norm_url_tuple = urlparse.urlparse(norm_url)

                # unshorten URLs for common URL minimizer services
                if norm_url_tuple[1] in URL_SHORTENERS:
                    norm_url = unshorten_url(norm_url)
                    norm_url_tuple = urlparse.urlparse(norm_url)

                md5_url = hashlib.md5()
                md5_url.update(norm_url.encode("utf-8"))

                c.execute("INSERT INTO url (url, domain, url_hash) VALUES (%s,%s,%s)",
                          (norm_url, norm_url_tuple[1], md5_url.hexdigest()))
                url_id = c.lastrowid
                c.execute("INSERT INTO tweet_urls (tweet_id, url_id) VALUES (%s,%s)",
                          (tweet_id, url_id))



            conn.commit()
            self.tweet_count += 1
            if self.tweet_count % 1000 == 0:
                print self.tweet_count

        return True
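unshorten_url and URL_SHORTENERS are defined elsewhere in that project; a minimal sketch of what such an unshortener could look like, using requests to follow redirects (the names and the behaviour are assumptions based on the call site):

import requests

URL_SHORTENERS = {'bit.ly', 't.co', 'goo.gl', 'tinyurl.com'}   # assumed list

def unshorten_url(url, timeout=5):
    # Follow redirects and return the final URL; fall back to the input on error.
    try:
        response = requests.head(url, allow_redirects=True, timeout=timeout)
        return response.url
    except requests.RequestException:
        return url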
Example #51
    def scrape(self):
        stories = self._scrape()
        # If we've scraped the same canonical URL twice, we will just choose the first one
        urls = set()
        for story in list(stories):  # iterate over a copy so remove() below is safe
            try:
                url = urlnorm.norm(story.url)
            except:
                # If we've scraped a bad UTF-8 character here, this might fail
                url = story.url

            if url in urls:
                stories.remove(story)
            else:
                urls.add(url)
                story.url = url
                story.title = story.title.strip()

        return stories
Example #52
def storeURL(url, path, max_file_size):
    #download URL and send fileID
    log.debug("downloading url", url=url, max_file_size=max_file_size )
    try:
        r = requests.get(url, stream=True)
        size = 0
        ctt = StringIO()
    
        sig = hashlib.md5()
        for chunk in r.iter_content(2048):
            size += len(chunk)
            ctt.write(chunk)
            sig.update(chunk)
            if size >  max_file_size:
                r.close()
                raise RequestEntityTooLarge()
    
        md5 = sig.hexdigest()
        ctt.seek(0)
        
        fpath=os.path.join(path, md5)
        if os.path.exists(fpath):
            print 'file exists', fpath
            return md5
        log.debug("storing url", url=url, file=fpath)
        with open (fpath,'w') as fd:
            t = ctt.read(1048576)
            while t:
                fd.write(t)
                t = ctt.read(1048576)
        
        url_norm = urlnorm.norm(url.strip())
        url_fname = urllib.quote_plus(url_norm)
        f = os.path.join(path, url_fname)

        
        os.symlink(fpath,f)
        log.debug("url stored", url=url, file=fpath)
        
        return md5
    except Exception as e:
        raise e
Example #53
 def save_link(cls, title, url, body="", tags=[], clicks=0, unread=True):
     url = norm(url)
     id = mmh3.hash(url)
     key = ndb.Key(LinkModel, id)
     domain = urlparse(url).netloc
     if len(domain) > 4 and domain.startswith('www.'):
         domain = domain[4:]
     link = LinkModel(key=key,
                      title=title,
                      url=url,
                      domain=domain,
                      body=body,
                      tags=tags,
                      clicks=clicks,
                      unread=unread)
     link.put()
     id = str(link.id)
     doc = cls._buildDoc(id, title, body, domain, tags)
     cls.add(doc)
     return cls(doc, link)
Example #54
def main():

    print "\n.: BUCKLEGRIPPER v0.1 https://github.com/hadojae/DATA :."

    parser = argparse.ArgumentParser(description='Visit a suspected phishing page, screenshot it and pillage it for phishing archives')
    parser.add_argument('-u','--url', help='Url to visit',required=False,default=False)
    parser.add_argument('-s','--source', help='Apply a source to where this url came from',required=False,default="bucklegripper")
    parser.add_argument('-r','--readfile', help='Read in a file of URLs one per line',required=False,default=False)
    parser.add_argument('-a','--useragent', help='Custom User-Agent',required=False,default="Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36")

    args = parser.parse_args()
    user_agent = args.useragent
    full = args.url
    source = args.source
    readfile = args.readfile

    if full == False and readfile == False:
        print bcolors.FAIL + "\n[-] You have to enter either a url with '-u' to analyze or specify a file with urls in it with '-r'\n" + bcolors.ENDC
        sys.exit() 

    # "setup fake ua for urllib2 requests"
    headers = { 'User-Agent' : user_agent }

    if readfile == False:
        mainloop(full, headers, user_agent, source)
        sys.exit()
    else:
        print "\n[+] Beginning processing of " + readfile
        with open(readfile) as f:
            content = f.readlines()
            for line in content:
                #catch bad url
                try:
                    full = urlnorm.norm(line).strip('\n')
                except Exception:
                    print bcolors.FAIL + "[-] " + line + " is a Malformed URI" + bcolors.ENDC
                    continue 
                mainloop(full, headers, user_agent, source)
        print "\n[+] Finished processing " + readfile + '\n'
        sys.exit() 
Example #55
    def enqueue(self, url, *args):
        # We add explored bool too. Since links that
        # are not explored only can average prev scores
        # normalizedURL = url
        if (len(args) != 1):
            crawlerLogger.error("Required was Priority but more args supplied")

        priority = args[0]
        try:
            normalizedURL = urlnorm.norm(url)

            if (normalizedURL not in self._linkDict):
                self._linkDict[normalizedURL] = (priority, False)
            else:
                # Average the two scores if found
                prevPriority, explored = self._linkDict[normalizedURL]
                if (not explored):
                    self._linkDict[normalizedURL] = (
                        (prevPriority + priority) / 2, False)
        except Exception as e:
            crawlerLogger.warn("Normalization Issues. Not Enqueing " + url)
        self._buildHeap()