Example No. 1
def expand_url(request, response):
    """
    Handle url expanding. Returns limited url info
    """
    db = request.context['db']
    user = request.context['user']

    # validate query params
    if 'short_url' not in request.params:
        response.status = HTTP_400
        return {'error': 'short_url GET param missing'}

    # validate url
    try:
        short_url = clean_url(request.params['short_url'])
    except ValueError:
        response.status = HTTP_400
        return {'error': 'short_url is not a valid URL'}

    # check that the url exists and belongs to the requesting user
    url = db.find_one_url({
        'short_url': short_url,
        'created_by': ObjectId(user['_id'])
    })
    if not url:
        response.status = HTTP_404
        return {'error': 'short_url does not exist'}

    return {
        'short_url': request.params['short_url'],
        'long_url': url['long_url'],
    }
Example No. 2
def begin_crawl():
    print("pushing url info onto the stack")
    # explode all of our category `start_urls` into subcategories
    completed_target = 0
    urls = get_category_info(completed_target)
    print(len(urls))

    # initialize the queue: clear any urls left over from a previous run
    if get_queue_length() > 0:
        clean_url()

    for record in urls:
        # only enqueue categories that have not been completed yet
        if record['completed'] != completed_target:
            continue
        new_url = {}
        try:
            # copy the category hierarchy, page unit and url
            for key in ('category1', 'category2', 'category3',
                        'category4', 'category5', 'category6',
                        'category7', 'pageunit', 'url'):
                new_url[key] = record[key]

            # rewrite page-1 urls to carry the extra query parameters
            if record['url'].split("page=")[1] == "1":
                new_url['url'] = record['url'].replace("?", "?bbn=1&dc&")
        except (KeyError, IndexError):
            print(record)

        enqueue_url(new_url)

    print(get_queue_length())
    set_header_id(0)
    print("completed url pushing")
Example No. 3
def extract_feeds(html, url):
    """Extract feed URLs from a webpage."""
    w = BeautifulSoup(html, 'html.parser')  # FIXME handle errors
    feeds = []
    for node in w.find_all(is_feed_link):
        try:
            feed_url = node['href']
        except KeyError:
            # feed link without an href attribute; skip it
            pass
        else:
            feeds.append(clean_url(feed_url, url))
    return feeds
Example No. 4
def extract_feeds(html, url):
    """Extract feed URLs from a webpage."""
    w = BeautifulSoup(html, 'html.parser')  # FIXME handle errors
    feeds = []
    for node in w.find_all('link', attrs={'type': 'application/rss+xml'}):
        try:
            feed_url = node['href']
        except KeyError:
            # feed link without an href attribute; skip it
            pass
        else:
            feeds.append(clean_url(feed_url, url))
    return feeds
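
Example No. 3 delegates the filtering to an is_feed_link predicate that is not shown in this listing, while Example No. 4 inlines the equivalent query. A minimal sketch of the predicate consistent with Example No. 4 (matching Atom feeds as well is an added assumption):

def is_feed_link(tag):
    # match <link> tags whose type attribute advertises a feed;
    # the Atom MIME type is an assumption beyond the inline query above
    return (tag.name == 'link'
            and tag.get('type') in ('application/rss+xml',
                                    'application/atom+xml'))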
Example No. 5
def test_clean_url():
    """
    testing clean_url helper
    """
    bad = 123
    bad2 = ''
    good = 'http://google.com'
    without_scheme = 'google.com'
    with_trailing_slash = 'google.com/'

    with pytest.raises(ValueError):
        clean_url(bad)

    with pytest.raises(ValueError):
        clean_url(bad2)

    assert clean_url(good) == good
    assert clean_url(without_scheme) == good
    assert clean_url(with_trailing_slash) == good
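
The test pins down the expected behaviour of the single-argument clean_url used in Examples No. 1 and No. 6: reject non-string or empty input, add a default http:// scheme, and strip a trailing slash. A minimal sketch consistent with those assertions (the real helper may validate more; the two-argument clean_url in Examples No. 3 and No. 4 is a different helper that presumably also resolves feed hrefs against the page URL):

from urllib.parse import urlparse


def clean_url(url):
    """Normalize a URL, raising ValueError on invalid input."""
    if not isinstance(url, str) or not url:
        raise ValueError('url must be a non-empty string')
    if '://' not in url:
        # default to http when no scheme is given
        url = 'http://' + url
    if not urlparse(url).netloc:
        raise ValueError('url is not a valid URL')
    return url.rstrip('/')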
Example No. 6
def short_url(request, response):
    """
    Handles url shortening
    """
    db = request.context['db']
    user = request.context['user']
    host = request.context['host']

    # check long_url param
    if 'long_url' not in request.params:
        response.status = HTTP_400
        return {'error': 'long_url GET param missing'}

    long_url = request.params['long_url']

    # validate url
    try:
        long_url = clean_url(long_url)
    except ValueError:
        response.status = HTTP_400
        return {'error': 'long_url is not a valid URL'}

    # validate code
    code = request.params.get('code')
    if code and len(code) > DB.MAX_CODE_LEN:
        response.status = HTTP_400
        msg = 'code param must have a max length of {}'.format(DB.MAX_CODE_LEN)
        return {'error': msg}

    # check if the url already exists for this user
    if code:
        query = {
            'code': code,
            'created_by': ObjectId(user['_id'])
        }
    else:
        query = {
            'long_url': long_url,
            'created_by': ObjectId(user['_id'])
        }

    exists = db.find_one_url(query)
    if exists:
        response.status = HTTP_409
        return {'error': 'long_url already exists'}

    # create url
    code = code or db.generate_url_code(host)
    short_url = '{}/{}'.format(host, code)
    url = {
        'short_url': short_url,
        'long_url': long_url,
        'code': code,
        'url_access': [],
        'created_at': datetime.datetime.now(),
        'created_by': ObjectId(user['_id']),
    }

    db.insert_url(url)

    response.status = HTTP_201
    return {'short_url': short_url}
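
The handler above delegates code generation to db.generate_url_code(host), which is not part of this listing. A standalone sketch of what such a generator might do (the alphabet, default length, and signature are assumptions; the real method lives on the db object and would also need to retry on collisions):

import random
import string

ALPHABET = string.ascii_letters + string.digits  # url-safe characters


def generate_url_code(length=7):
    # assumption: a random alphanumeric code no longer than
    # DB.MAX_CODE_LEN (9 in this listing)
    return ''.join(random.choice(ALPHABET) for _ in range(length))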