def expand_url(request, response):
    """ Handle url expanding. Returns limited url info """
    db = request.context['db']
    user = request.context['user']

    # validate query params
    if 'short_url' not in request.params:
        response.status = HTTP_400
        return {'error': 'short_url GET param missing'}

    # validate url
    try:
        short_url = clean_url(request.params['short_url'])
    except ValueError:
        response.status = HTTP_400
        return {'error': 'short_url is not a valid URL'}

    # check if url exists
    url = db.find_one_url({
        'short_url': short_url,
        'created_by': ObjectId(user['_id'])
    })
    if not url:
        response.status = HTTP_404
        return {'error': 'short_url does not exist'}

    return {
        'short_url': request.params['short_url'],
        'long_url': url['long_url'],
    }

def begin_crawl():
    print("pushing url info into the stack")
    # explode out all of our category `start_urls` into subcategories
    completed_target = 0
    url = get_category_info(completed_target)
    print(len(url))

    # initialize queue
    if get_queue_length() > 0:
        clean_url()

    # populate urls from page 1 to max
    for item in url:
        if item['completed'] != completed_target:
            continue
        new_url = {}
        try:
            for field in ('category1', 'category2', 'category3', 'category4',
                          'category5', 'category6', 'category7',
                          'pageunit', 'url'):
                new_url[field] = item[field]
            # page 1 urls get the extra browse-node query params
            if item['url'].split("page=")[1] == "1":
                new_url['url'] = item['url'].replace("?", "?bbn=1&dc&")
        except (KeyError, IndexError):  # missing field, or no "page=" in url
            print(item)
        enqueue_url(new_url)

    print(get_queue_length())
    set_header_id(0)
    print("completed url pushing")

def extract_feeds(html, url):
    """Extract feed urls from webpage"""
    w = BeautifulSoup(html, 'html.parser')  # FIXME handle errors
    feeds = []
    for node in w.find_all(is_feed_link):
        try:
            feed_url = node['href']
        except KeyError:
            pass
        else:
            feeds.append(clean_url(feed_url, url))
    return feeds

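# is_feed_link, used above, is not defined in this section. A plausible
# sketch, inferred from the inline filter in the next variant of
# extract_feeds (a <link> tag advertising an RSS mime type); treat the
# exact matching rule as an assumption, not the real predicate.
def is_feed_link(tag):
    """Return True for <link type="application/rss+xml"> elements."""
    return tag.name == 'link' and tag.get('type') == 'application/rss+xml'
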
def extract_feeds(html, url):
    """Extract feed urls from webpage"""
    w = BeautifulSoup(html, 'html.parser')  # FIXME handle errors
    feeds = []
    for node in w.find_all('link', attrs={'type': 'application/rss+xml'}):
        try:
            feed_url = node['href']
        except KeyError:
            pass
        else:
            feeds.append(clean_url(feed_url, url))
    return feeds

def test_clean_url():
    """ testing clean_url helper """
    bad = 123
    bad2 = ''
    good = 'http://google.com'
    without_scheme = 'google.com'
    with_trailing_slash = 'google.com/'

    with pytest.raises(ValueError):
        clean_url(bad)
    with pytest.raises(ValueError):
        clean_url(bad2)

    assert clean_url(good) == good
    assert clean_url(without_scheme) == good
    assert clean_url(with_trailing_slash) == good

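# The clean_url helper exercised by this test is not shown in this section.
# A minimal sketch matching the one-argument form asserted above (the
# two-argument form used in extract_feeds presumably also resolves relative
# urls against a base); scheme defaulting and slash stripping are inferred
# from the assertions, not from the real implementation.
from urllib.parse import urlparse


def clean_url(raw_url):
    """Validate a url, defaulting the scheme to http and stripping any
    trailing slash. Raises ValueError on non-string or empty input."""
    if not isinstance(raw_url, str) or not raw_url:
        raise ValueError('url must be a non-empty string')
    if '://' not in raw_url:
        raw_url = 'http://' + raw_url
    if not urlparse(raw_url).netloc:
        raise ValueError('url has no host')
    return raw_url.rstrip('/')
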
def short_url(request, response):
    """ Handles url shortening """
    db = request.context['db']
    user = request.context['user']
    host = request.context['host']

    # check long_url param
    if 'long_url' not in request.params:
        response.status = HTTP_400
        return {'error': 'long_url GET param missing'}
    long_url = request.params['long_url']

    # validate url
    try:
        long_url = clean_url(long_url)
    except ValueError:
        response.status = HTTP_400
        return {'error': 'long_url is not a valid URL'}

    # validate code
    code = request.params.get('code')
    if code and len(code) > DB.MAX_CODE_LEN:
        response.status = HTTP_400
        return {'error': 'Code param must have a max length of {}'.format(
            DB.MAX_CODE_LEN)}

    # check if url already exists: build the query first, then run it once
    # (the original queried twice, passing the first result back as a query)
    if code:
        query = {
            'code': code,
            'created_by': ObjectId(user['_id']),
        }
    else:
        query = {
            'long_url': long_url,
            'created_by': ObjectId(user['_id']),
        }
    exists = db.find_one_url(query)
    if exists:
        response.status = HTTP_409
        return {'error': 'long_url already exists'}

    # create url
    code = code or db.generate_url_code(host)
    short_url = '{}/{}'.format(host, code)
    url = {
        'short_url': short_url,
        'long_url': long_url,
        'code': code,
        'url_access': [],
        'created_at': datetime.datetime.now(),
        'created_by': ObjectId(user['_id']),
    }
    db.insert_url(url)

    response.status = HTTP_201
    return {'short_url': short_url}

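# db.generate_url_code is called above but not shown in this section. A
# hypothetical sketch of such a generator; the function name here, the code
# length, the alphabet, and the collision policy are all assumptions, not
# the real implementation. Only find_one_url is taken from the code above.
import random
import string


def generate_url_code(db, host, length=6):
    """Pick a random alphanumeric code whose short url is not taken yet."""
    alphabet = string.ascii_letters + string.digits
    while True:
        code = ''.join(random.choice(alphabet) for _ in range(length))
        if not db.find_one_url({'short_url': '{}/{}'.format(host, code)}):
            return code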