Example #1
    def __init__(self, loglevel: int = logging.INFO):
        super().__init__(loglevel)
        self.lookyloo = Lookyloo()
        self.script_name = 'background_indexer'
        # make sure discarded captures dir exists
        self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
        self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)
Example #2
    def __init__(self,
                 storage_directory: Optional[Path] = None,
                 loglevel: int = logging.INFO):
        super().__init__(loglevel)
        if not storage_directory:
            self.storage_directory = get_homedir() / 'scraped'
        self.lookyloo = Lookyloo()
Example #3
def main():
    parser = argparse.ArgumentParser(description='Rebuild the redis cache.')
    parser.add_argument('--rebuild_pickles', default=False, action='store_true', help='Delete and rebuild the pickles. Count 20s/pickle, it can take a very long time.')
    args = parser.parse_args()

    lookyloo = Lookyloo()
    if args.rebuild_pickles:
        lookyloo.rebuild_all()
    else:
        lookyloo.rebuild_cache()

    indexing = Indexing()
    indexing.clear_indexes()
    for capture_uuid in lookyloo.capture_uuids:
        index = True
        try:
            tree = lookyloo.get_crawled_tree(capture_uuid)
        except Exception as e:
            print(capture_uuid, e)
            continue

        if lookyloo.is_public_instance:
            cache = lookyloo.capture_cache(capture_uuid)
            if not cache:
                continue
            if cache.no_index is not None:
                index = False

        # NOTE: these methods do nothing if we just generated the pickle when calling lookyloo.get_crawled_tree
        if index:
            indexing.index_cookies_capture(tree)
            indexing.index_body_hashes_capture(tree)
            indexing.index_url_capture(tree)
            categories = list(lookyloo.categories_capture(capture_uuid).keys())
            indexing.index_categories_capture(capture_uuid, categories)
Example #4
    def __init__(self, loglevel: int = logging.INFO):
        super().__init__(loglevel)
        self.lookyloo = Lookyloo()
        self.script_name = 'async_capture'
        self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
        self.capture_dir: Path = get_captures_dir()
        self.splash_url: str = get_splash_url()
        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
Example #5
class AsyncCapture(AbstractManager):

    def __init__(self, storage_directory: Optional[Path] = None, loglevel: int = logging.INFO):
        super().__init__(loglevel)
        if not storage_directory:
            self.storage_directory = get_homedir() / 'scraped'
        self.lookyloo = Lookyloo()

    def _to_run_forever(self):
        set_running('async_capture')
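        # Process queued captures until the queue is empty or a shutdown is requested.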
        while True:
            url = self.lookyloo.process_capture_queue()
            if url is None or shutdown_requested():
                break
        unset_running('async_capture')
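
These managers are started from small entry-point scripts. Below is a minimal sketch of such a launcher, assuming AbstractManager exposes a run(sleep_in_sec=...) loop that repeatedly calls _to_run_forever(); the run() name and its parameter are assumptions, not shown in the excerpts.

def main():
    m = AsyncCapture()
    m.run(sleep_in_sec=1)  # assumed AbstractManager entry point


if __name__ == '__main__':
    main()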
Example #6
class AsyncScraper(AbstractManager):
    def __init__(self,
                 storage_directory: Optional[Path] = None,
                 loglevel: int = logging.INFO):
        super().__init__(loglevel)
        if not storage_directory:
            self.storage_directory = get_homedir() / 'scraped'
        # only_global_lookups is expected to be defined at module level (see Example #12).
        self.lookyloo = Lookyloo(loglevel=loglevel,
                                 only_global_lookups=only_global_lookups)

    def _to_run_forever(self):
        set_running('async_scrape')
        while True:
            url = self.lookyloo.process_scrape_queue()
            if url is None or shutdown_requested():
                break
        unset_running('async_scrape')
Example #7
def main():
    parser = argparse.ArgumentParser(description='Rebuild the redis cache.')
    parser.add_argument('--rebuild_pickles', default=False, action='store_true',
                        help='Delete and rebuild the pickles. Count 20s/pickle, it can take a very long time.')
    args = parser.parse_args()

    lookyloo = Lookyloo()
    if args.rebuild_pickles:
        lookyloo.rebuild_all()
    else:
        lookyloo.rebuild_cache()

    indexing = Indexing()
    indexing.clear_indexes()

    # This call will rebuild all the caches as needed.
    lookyloo.sorted_capture_cache()
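
The entries returned by sorted_capture_cache() expose at least uuid and timestamp attributes (see Examples #9 and #13), so an illustrative sanity check after the rebuild could look like this:

# Illustrative only: walk the freshly rebuilt cache entries.
lookyloo = Lookyloo()
for cache in lookyloo.sorted_capture_cache():
    print(cache.uuid, cache.timestamp)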
Example #8
import json
from typing import Any, Dict

import flask_login  # type: ignore
from flask import request, send_file
from flask_restx import Namespace, Resource, abort, fields  # type: ignore
from werkzeug.security import check_password_hash

from lookyloo.helpers import splash_status
from lookyloo.lookyloo import Lookyloo

from .helpers import build_users_table, load_user_from_request, src_request_ip

api = Namespace('GenericAPI', description='Generic Lookyloo API', path='/')

lookyloo: Lookyloo = Lookyloo()


def api_auth_check(method):
    if flask_login.current_user.is_authenticated or load_user_from_request(request):
        return method
    abort(403, 'Authentication required.')


token_request_fields = api.model('AuthTokenFields', {
    'username': fields.String(description="Your username", required=True),
    'password': fields.String(description="Your password", required=True),
})
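
A model like token_request_fields is normally attached to a flask_restx Resource. The sketch below is illustrative only: the route path, the layout of the table returned by build_users_table(), and the response key are assumptions, not taken from the excerpt.

@api.route('/json/get_token')  # assumed path
class AuthToken(Resource):

    @api.doc(body=token_request_fields)
    def post(self):
        auth: Dict[str, Any] = request.get_json(force=True)
        users = build_users_table()  # assumed to map username -> {'password_hash': ..., 'authkey': ...}
        user = users.get(auth.get('username', ''))
        if user and check_password_hash(user['password_hash'], auth.get('password', '')):
            return {'authkey': user['authkey']}
        abort(401, 'Invalid credentials.')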
Example #9
class BackgroundIndexer(AbstractManager):
    def __init__(self, loglevel: int = logging.INFO):
        super().__init__(loglevel)
        self.lookyloo = Lookyloo()
        self.script_name = 'background_indexer'
        # make sure discarded captures dir exists
        self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
        self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)

    def _to_run_forever(self):
        self._build_missing_pickles()
        self._check_indexes()

    def _build_missing_pickles(self):
        for uuid_path in sorted(self.lookyloo.capture_dir.glob('**/uuid'),
                                reverse=True):
            if (uuid_path.parent / 'tree.pickle').exists():
                continue
            lock_file = uuid_path.parent / 'lock'
            if lock_file.exists():
                try:
                    with lock_file.open('r') as f:
                        lock_ts = datetime.fromisoformat(f.read())
                    if lock_ts < datetime.now() - timedelta(minutes=5):
                        # Clear old locks. They shouldn't be there, but it's gonna happen.
                        self.logger.info(f'Old lock found {lock_file}, removing it.')
                        lock_file.unlink(missing_ok=True)
                except Exception as e:
                    self.logger.info(f'Error while reading lock {lock_file}: {e}')
                continue

            with uuid_path.open() as f:
                uuid = f.read()
            if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
                # The capture with this UUID exists, but it is for some reason missing in lookup_dirs
                self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent))

            try:
                self.logger.info(f'Building pickle for {uuid}: {uuid_path.parent.name}')
                self.lookyloo.get_crawled_tree(uuid)
                self.lookyloo.trigger_modules(uuid, auto_trigger=True)
                self.logger.info(f'Pickle for {uuid} built.')
            except MissingUUID:
                self.logger.warning(f'Unable to find {uuid}. That should not happen.')
            except NoValidHarFile:
                self.logger.warning(f'Unable to build pickle for {uuid}: {uuid_path.parent.name}')
                # The capture is not working, moving it away.
                self.lookyloo.redis.hdel('lookup_dirs', uuid)
                uuid_path.parent.rename(self.discarded_captures_dir / uuid_path.parent.name)

    def _check_indexes(self):
        index_redis = self.lookyloo.indexing.redis
        for cache in self.lookyloo.sorted_capture_cache():
            if self.lookyloo.is_public_instance and cache.no_index:
                # Capture unindexed
                continue
            p = index_redis.pipeline()
            p.sismember('indexed_urls', cache.uuid)
            p.sismember('indexed_body_hashes', cache.uuid)
            p.sismember('indexed_cookies', cache.uuid)
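            # execute() returns results in the order the commands were queued: urls, body hashes, cookies.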
            indexed = p.execute()
            if all(indexed):
                continue
            try:
                ct = self.lookyloo.get_crawled_tree(cache.uuid)
            except NoValidHarFile:
                self.logger.warning(f'Broken pickle for {cache.uuid}')
                self.lookyloo.remove_pickle(cache.uuid)
                continue

            if not indexed[0]:
                self.logger.info(f'Indexing urls for {cache.uuid}')
                self.lookyloo.indexing.index_url_capture(ct)
            if not indexed[1]:
                self.logger.info(f'Indexing resources for {cache.uuid}')
                self.lookyloo.indexing.index_body_hashes_capture(ct)
            if not indexed[2]:
                self.logger.info(f'Indexing cookies for {cache.uuid}')
                self.lookyloo.indexing.index_cookies_capture(ct)
Example #10
import calendar
import datetime
from typing import Any, Dict, Set, Union
from urllib.parse import urlparse

from lookyloo.lookyloo import Lookyloo

lookyloo = Lookyloo()

stats: Dict[Union[str, int], Any] = {}

today = datetime.date.today()
calendar_week = today.isocalendar()[1]
weeks_stats: Dict[int, Dict[str, Union[int, Set[str]]]] = \
    {calendar_week - 1: {'analysis': 0, 'analysis_with_redirects': 0, 'redirects': 0, 'uniq_urls': set()},
     calendar_week: {'analysis': 0, 'analysis_with_redirects': 0, 'redirects': 0, 'uniq_urls': set()}}


def uniq_domains(uniq_urls):
    domains = set()
    for url in uniq_urls:
        splitted = urlparse(url)
        domains.add(splitted.hostname)
    return domains


for uuid in lookyloo.capture_uuids:
    cache = lookyloo.capture_cache(uuid)
    if not cache or not hasattr(cache, 'timestamp'):
        continue
    date = cache.timestamp
    if date.year not in stats:
Example #11
secret_file_path = get_homedir() / 'secret_key'

if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
    with secret_file_path.open('wb') as f:
        f.write(os.urandom(64))

with secret_file_path.open('rb') as f:
    app.config['SECRET_KEY'] = f.read()

Bootstrap(app)
app.config['BOOTSTRAP_SERVE_LOCAL'] = True
app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
app.debug = False

lookyloo = Lookyloo()


# keep
def load_tree(report_dir):
    session.clear()
    temp_file_name, tree_json, tree_time, tree_ua, tree_root_url, meta = lookyloo.load_tree(
        report_dir)
    session["tree"] = temp_file_name
    return tree_json, tree_time, tree_ua, tree_root_url, meta


@app.route('/submit', methods=['POST', 'GET'])
def submit():
    to_query = request.get_json(force=True)
    perma_uuid = lookyloo.enqueue_scrape(to_query)
Example #12
app.config['BOOTSTRAP_SERVE_LOCAL'] = True
app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
app.debug = False

# API entry point for splash
if os.environ.get('SPLASH_URL'):
    splash_url = os.environ.get('SPLASH_URL')
else:
    splash_url = 'http://127.0.0.1:8050'
# Splash log level
loglevel = logging.DEBUG
# Set it to True if your instance is publicly available so users aren't able to scan your internal network
only_global_lookups = False

lookyloo = Lookyloo(splash_url=splash_url,
                    loglevel=loglevel,
                    only_global_lookups=only_global_lookups)


# keep
def load_tree(report_dir):
    session.clear()
    temp_file_name, tree_json, tree_time, tree_ua, tree_root_url, meta = lookyloo.load_tree(
        report_dir)
    session["tree"] = temp_file_name
    return tree_json, tree_time, tree_ua, tree_root_url, meta


@app.route('/submit', methods=['POST', 'GET'])
def submit():
    to_query = request.get_json(force=True)
Example #13
import calendar
import datetime
from typing import Any, Dict, Set, Union
from urllib.parse import urlparse

from lookyloo.lookyloo import Lookyloo

lookyloo = Lookyloo()

stats: Dict[Union[str, int], Any] = {}

today = datetime.date.today()
calendar_week = today.isocalendar()[1]
weeks_stats: Dict[int, Dict[str, Union[int, Set[str]]]] = \
    {calendar_week - 1: {'analysis': 0, 'analysis_with_redirects': 0, 'redirects': 0, 'uniq_urls': set()},
     calendar_week: {'analysis': 0, 'analysis_with_redirects': 0, 'redirects': 0, 'uniq_urls': set()}}


def uniq_domains(uniq_urls):
    domains = set()
    for url in uniq_urls:
        splitted = urlparse(url)
        domains.add(splitted.hostname)
    return domains


for cache in lookyloo.sorted_capture_cache():
    date = cache.timestamp
    if date.year not in stats:
        stats[date.year] = {}
    if date.month not in stats[date.year]:
        stats[date.year][date.month] = {
Example #14
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from lookyloo.lookyloo import Lookyloo

lookyloo = Lookyloo()

# Rebuild the tree for every capture directory and make sure its metadata is present.
for capture_dir in lookyloo.capture_dirs:
    try:
        ct = lookyloo.get_crawled_tree(capture_dir)
    except Exception:
        continue
    lookyloo._ensure_meta(capture_dir, ct)