예제 #1
0
def test_read_ini_file():
    """Check that values can be read from the sample ini file by section/key."""
    config_path = os.path.join('files', 'config.ini')
    config = Ini(config_path)

    # The HEADLESS/PATH entry must point at the expected binary location.
    assert config.read('HEADLESS', 'PATH') == 'files/phantomjs'

    # The ELASTICSEARCH/HOST entry only has to be present and non-empty.
    assert config.read('ELASTICSEARCH', 'HOST')
예제 #2
0
def test_tor_ping_check():
    """Ping an onion host over Tor on one port expected open, one closed."""
    onion_host = 'facebookcorewwwi.onion'
    config = Ini('files/config.ini')

    with Socket(tor_network=True, ini=config) as sock:
        # Port 80 is expected to answer.
        port_80_result = sock.ping_check(onion_host, 80)
        assert port_80_result == True

        # Port 31337 is expected not to answer.
        port_31337_result = sock.ping_check(onion_host, 31337)
        assert port_31337_result == False
예제 #3
0
def test_tor_http():
    """Request an onion site over Tor and check a response with headers."""
    ini = Ini('files/config.ini')
    response = HTTP().request(url='https://facebookcorewwwi.onion',
                              tor_network=True,
                              ini=ini)

    # A falsy response already fails the test on the first assert, so the
    # headers check needs no additional guard.
    assert response
    assert response.headers
예제 #4
0
def run_crawler(self, url):
    """
    Scan a single url and save the report when it is usable.
    :param self: task instance whose ``request.id`` is used as the save id
                 (presumably a bound Celery task -- confirm against the
                 decorator, which is not visible here)
    :param url: address to scan
    :return: None
    """
    Log.i(f"Starting crawler task for {url}")

    crawler = Crawler(ini=Ini(Env.read("CONFIG_FILE")))
    report = crawler.scan(url)

    # Keep only non-empty reports whose page url equals the requested url.
    is_usable = not report.is_empty() and report.webpage.url == url
    if is_usable:
        crawler.save(self.request.id, report)

    # Drop the crawler reference explicitly once the work is done.
    del crawler
예제 #5
0
def test_load_crawler():
    """Scan an onion page and check the type and fields of the report."""
    onion_url = 'http://wikitjerrta4qgz4.onion'

    config = Ini('files/config.ini')
    crawler = Crawler(config)
    assert crawler

    report = crawler.scan(onion_url)

    # The report must be exactly a DynamicObject describing the scanned page.
    assert type(report) == DynamicObject
    webpage = report.webpage
    assert webpage.url == onion_url
    assert webpage.domain == 'wikitjerrta4qgz4.onion'

    del crawler
예제 #6
0
class SourceBase(object):
    """
    Base source object class format.

    Holds a list of urls, the ini configuration and an ``active`` flag.
    ``collect`` is an empty hook; ``save`` stores every pending url and
    issues a crawler task for it.
    """
    # NOTE: class-level list, shared by every subclass/instance that does not
    # assign its own ``urls``.
    urls = []
    ini = Ini(Env.read('CONFIG_FILE'))
    active = True  # collector status

    def collect(self):
        """
        Run user custom method. The base implementation does nothing.
        :return: None
        """
        pass

    def save(self):
        """
        Save domain on database and request crawling.

        Each url in ``self.urls`` is added to the database and, on success,
        a crawler task is issued with the same id as the stored row. Every
        url is removed from ``self.urls`` after it has been handled, whether
        or not it could be stored.
        :return: None
        """
        engine = Engine.create(self.ini)
        with Session(engine=engine) as session:
            # Iterate over a snapshot: removing items from the list that is
            # being iterated would skip every other url.
            for url in list(self.urls):
                task_id = uuid4().hex

                try:
                    # add url into database
                    session.add(Domain(uuid=task_id, url=url))
                    session.commit()

                    task = run_crawler.apply_async(args=(url, ),
                                                   task_id=task_id)
                    Log.i("Crawler issued a new task id {} at {}".format(
                        task.task_id, url))
                except Exception:
                    # Catch Exception rather than everything so that
                    # KeyboardInterrupt/SystemExit still propagate.
                    # NOTE(review): any failure (not only a duplicate url) is
                    # reported with this message -- confirm that is intended.
                    Log.d(
                        "This {} url already saved into database.".format(url))
                finally:
                    self.urls.remove(url)
예제 #7
0
from database.session import Session
from database.engine import Engine
from database.models import Domain

from utils.config.env import Env
from utils.config.ini import Ini

# Configuration shared by the tests below; Env.read('CONFIG_FILE') supplies
# the ini path.
ini = Ini(Env.read('CONFIG_FILE'))


def test_create_engine():
    """
    Check that an engine can be created from the ini configuration.
    :return: None
    """
    engine = Engine.create(ini=ini)
    assert engine


def test_database_session():
    """
    Check that a database session can be opened on a newly created engine.
    :return: None
    """
    db_engine = Engine.create(ini=ini)

    with Session(engine=db_engine) as db_session:
        assert db_session


def test_manage_model():
    """
    Test for create a new table at memory database
예제 #8
0
"""
Celery application.
"""
from utils.config.ini import Ini
from utils.config.env import Env

from celery import Celery


# Configuration for the Celery app; Env.read('CONFIG_FILE') supplies the ini
# path.
ini = Ini(Env.read('CONFIG_FILE'))

# load celery app
# The app is named 'crawler_tasks'; its broker and result backend are read
# from the CELERY/BROKER_URL and CELERY/RESULT_BACKEND ini entries.
app = Celery(
    'crawler_tasks',
    broker=ini.read('CELERY', 'BROKER_URL'),
    backend=ini.read('CELERY', 'RESULT_BACKEND'))
from utils.config.ini import Ini
from utils.network.headless import HeadlessBrowser
from utils.network.headless import InvalidURLException, InvalidHTMLException

import pytest


# Configuration shared by the headless browser tests below.
ini = Ini('files/config.ini')


def test_browser():
    """Run the headless browser on a regular site and expect a screenshot."""
    headless = HeadlessBrowser(ini=ini)
    headless.run(url='https://www.naver.com')

    # A truthy screenshot means the page was rendered and captured.
    image = headless.get_screenshot()
    assert image

    del headless


def test_tor_browser():
    """Test for running headless browser with tor proxy."""
    browser = HeadlessBrowser(
        ini=ini,
        tor_network=True
    )

    browser.run(url='http://wikitjerrta4qgz4.onion')

    screenshot = browser.get_screenshot()