# Standard-library and project imports used by the functions below; the
# helper functions get_text, get_url, get_links, get_robots_list,
# retrieve_content, check_error_status and change_url are defined elsewhere
# in the crawler module.
import random
import sys
from time import sleep

import crawler_backend


def get_urls_from_text(url):
    links = set()
    text = get_text(url)
    words = text.split()

    # Extract candidate v2 onion addresses: 16 characters before '.onion'.
    for word in words:
        if '.onion' in word:
            index = word.index('.onion')
            new_url = word[index - 16:index] + '.onion'
            links.add(new_url)

    for link in links:
        try:
            link = get_url(link)
            found = crawler_backend.check_url(link)
            if found:
                print('\n' + link + ' is already in the database.')
                print('Skipped...')
                continue
            else:
                print('\nInserting... ' + link)
                crawler_backend.insert_page(link)
                crawler_backend.insert_status(link, 'Alive')
                print('Status : Alive')

        except Exception:
            # The page could not be reached; record it as offline.
            print('\nInserting... ' + link)
            link = 'http://' + link
            crawler_backend.insert_page(link)
            crawler_backend.insert_status(link, 'Offline')
            print('Status : Offline')
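# A minimal, self-contained sketch of the same onion-address extraction,
# shown for illustration only; it is not part of the original crawler.
# The fixed 16-character slice above only matches v2 onion addresses; v3
# addresses use 56 base32 characters before '.onion', which the regex below
# also accepts as an assumption about the intended behaviour.
import re

ONION_PATTERN = re.compile(r'\b[a-z2-7]{16}(?:[a-z2-7]{40})?\.onion\b')

def extract_onion_addresses(text):
    # Return the unique onion addresses found in a block of text.
    return set(ONION_PATTERN.findall(text))

# Example:
# extract_onion_addresses('see http://expyuzz4wqqyqhjn.onion/ for details')
# -> {'expyuzz4wqqyqhjn.onion'}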
def crawl(url):
    url = get_url(url)
    links = get_links(url)

    for link in links:
        try:
            link = get_url(link)
            found = crawler_backend.check_url(link)
            if found:
                print('\n' + link + ' is already in the database.')
                print('Skipped...')
                continue
            else:
                print('\nInserting... ' + link)
                crawler_backend.insert_page(link)
                crawler_backend.insert_status(link, 'Alive')
                print('Status : Alive')

        except Exception:
            # The page could not be reached; record it as offline.
            print('\nInserting... ' + link)
            link = 'http://' + link
            crawler_backend.insert_page(link)
            crawler_backend.insert_status(link, 'Offline')
            print('Status : Offline')
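# A hedged sketch of what a helper like get_links might look like. The real
# implementation is not shown in this file, so everything here is an
# assumption (requests[socks] and beautifulsoup4 installed, Tor listening on
# its default SOCKS port 9050), not the crawler's actual code.
import requests
from bs4 import BeautifulSoup

TOR_PROXIES = {
    'http': 'socks5h://127.0.0.1:9050',
    'https': 'socks5h://127.0.0.1:9050',
}

def get_links_sketch(url):
    # Fetch the page through the Tor SOCKS proxy and collect .onion hrefs.
    response = requests.get(url, proxies=TOR_PROXIES, timeout=60)
    soup = BeautifulSoup(response.text, 'html.parser')
    return [a['href'] for a in soup.find_all('a', href=True)
            if '.onion' in a['href']]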
def depth_crawl(url, depth):
    url = get_url(url)

    # Respect the site's robots.txt before crawling.
    robots_list = get_robots_list(url)
    if url in robots_list:
        print('Not allowed to crawl url: ' + url + '\n')
        sys.exit()

    found = crawler_backend.check_url(url)
    if found:
        print(url + ' has already been crawled!\n')
        sys.exit()

    count = 0

    while count < depth:
        count += 1
        print('\nGetting level ' + str(count) + ' links...\n')
        try:
            if count == 1:
                links_list = get_links(url)
                for link in links_list:
                    retrieve_content(link)
                    # Pause after each page
                    pause = random.randint(1, 5)
                    sleep(pause)
            else:
                for link in links_list:
                    # Skip the starting URL; it has already been expanded.
                    if link == url:
                        continue
                    temp_links = get_links(link)
                    for temp_link in temp_links:
                        if temp_link not in links_list:
                            links_list.append(temp_link)
                            retrieve_content(temp_link)
                            # Pause after each page
                            pause = random.randint(1, 5)
                            sleep(pause)

        except KeyboardInterrupt:
            print()
            print('Program interrupted by user...')
            break

        except Exception:
            check_error_status(url)
            print()
            crawler_backend.insert_content(url, 'ERROR: Page skipped')
            continue
def random_crawl(url, number_of_pages):
    url = get_url(url)
    count = 0

    # Respect the site's robots.txt; if the page is disallowed, pick another.
    robots_list = get_robots_list(url)
    if url in robots_list:
        print('Not allowed to crawl url: ' + url + '\n')
        url = find_new_url(url)

    found = crawler_backend.check_url(url)
    if found:
        print(url + ' has already been crawled!\n')
        print('Searching for a new page to crawl...\n')
        try:
            url = find_new_url(url)
        # find_new_url signals exhaustion by calling sys.exit().
        except (Exception, SystemExit):
            print('Unable to find any valid links\n')
            print('The application will now terminate!\n')
            sys.exit()

    while count < number_of_pages:
        try:
            retrieve_content(url)

        except KeyboardInterrupt:
            print()
            print('Program interrupted by user...')
            break

        except Exception:
            # The page could not be retrieved; log it and move on.
            check_error_status(url)
            print()
            crawler_backend.insert_content(url, 'ERROR: Page skipped')
            url = find_new_url(url)
            continue

        count += 1

        # Pause after each page
        pause = random.randint(1, 5)
        sleep(pause)

        try:
            url = find_new_url(url)

        # Stop gracefully once no further uncrawled pages can be found.
        except (Exception, SystemExit):
            break
def find_new_url(url):
    page_found_counter = 0
    while True:
        url = change_url(url)
        found = crawler_backend.check_url(url)
        if not found:
            # Found a page that is not in the database yet.
            return get_url(url)
        print('Searching for a new page to crawl... (%s)\n' % url)
        page_found_counter += 1
        if page_found_counter == 30:
            # Give up after 30 consecutive already-crawled candidates.
            print('\nThere are no more pages to collect')
            sys.exit()
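# A hedged sketch of how these functions might be wired into a command-line
# entry point. The original file's actual entry point is not shown here, so
# the argument names and defaults below are assumptions for illustration only.
import argparse

def main():
    parser = argparse.ArgumentParser(description='Simple .onion crawler front end')
    parser.add_argument('url', help='starting .onion URL')
    parser.add_argument('--mode', choices=['single', 'depth', 'random'],
                        default='single')
    parser.add_argument('--depth', type=int, default=2,
                        help='levels to follow in depth mode')
    parser.add_argument('--pages', type=int, default=10,
                        help='pages to visit in random mode')
    args = parser.parse_args()

    if args.mode == 'depth':
        depth_crawl(args.url, args.depth)
    elif args.mode == 'random':
        random_crawl(args.url, args.pages)
    else:
        crawl(args.url)

if __name__ == '__main__':
    main()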