Пример #1
0
def _load_selenium_db() -> typing.List[Link]:
    """Fetch the next batch of links queued for :mod:`selenium`.

    Rows are read from the
    :class:`~darc.model.tasks.selenium.SeleniumQueueModel` table, oldest
    ``timestamp`` first, keeping only rows whose ``timestamp`` is not later
    than the current time minus ``TIME_CACHE`` (or not later than the current
    time when ``TIME_CACHE`` is :data:`None`).

    Returns:
        The ``link`` values of the selected rows, at most
        :data:`~darc.db.MAX_POOL` of them.

    Note:
        When ``TIME_CACHE`` is set, the fetched links are written back
        through :func:`_save_selenium_db` with a score of "now plus
        ``TIME_CACHE``" inside the same atomic block.

    """
    started = datetime.datetime.now()
    cache_span = TIME_CACHE
    caching = cache_span is not None
    # Without a cache window every row stamped up to "now" is eligible.
    cutoff = started - cache_span if caching else started

    with database.atomic():
        pending = SeleniumQueueModel.select(SeleniumQueueModel.link)
        pending = pending.where(SeleniumQueueModel.timestamp <= cutoff)
        pending = pending.order_by(SeleniumQueueModel.timestamp)
        pending = pending.limit(MAX_POOL)
        links = [row.link for row in pending]

        if caching:
            # force update records
            _save_selenium_db(links, score=started + cache_span)
    return links
Пример #2
0
def _load_requests_db() -> 'List[Link]':
    """Fetch the next batch of links queued for :mod:`requests`.

    Rows are read from the
    :class:`~darc.model.tasks.requests.RequestsQueueModel` table, oldest
    ``timestamp`` first, keeping only rows whose ``timestamp`` is not later
    than the current time minus ``TIME_CACHE`` (or not later than the current
    time when ``TIME_CACHE`` is :data:`None`).

    Returns:
        The ``link`` values of the selected rows, at most
        :data:`~darc.db.MAX_POOL` of them.

    Note:
        When ``TIME_CACHE`` is set, the fetched links are written back
        through :func:`_save_requests_db` with a score equal to the POSIX
        timestamp of "now plus ``TIME_CACHE``", inside the same atomic block.

    """
    moment = datetime.now()
    if TIME_CACHE is not None:
        hold = TIME_CACHE
        threshold = moment - hold
    else:
        hold = timedelta(seconds=0)
        threshold = moment

    with database.atomic():
        selection = (RequestsQueueModel
                     .select(RequestsQueueModel.link)
                     .where(RequestsQueueModel.timestamp <= threshold)
                     .order_by(RequestsQueueModel.timestamp)
                     .limit(MAX_POOL))
        # The bound ``execute`` is handed over uncalled; ``_db_operation``
        # is responsible for invoking it.
        rows = _db_operation(selection.execute)  # type: List[RequestsQueueModel]
        links = [row.link for row in rows]

        if TIME_CACHE is not None:
            # force update records
            _save_requests_db(links, score=(moment + hold).timestamp())
    return links
Пример #3
0
def _save_selenium_db(entries: typing.List[Link],
                      single: bool = False,
                      score=None,
                      nx=False,
                      xx=False):
    """Save link to the :mod:`selenium` database.

    The function updates the :class:`~darc.model.tasks.selenium.SeleniumQueueModel` table.

    Args:
        entries: Links to be added to the :mod:`selenium` database.
            It can be either a :obj:`list` of links, or a single
            link (if ``single`` set as :data:`True`).
        single: Indicate if ``entries`` is a :obj:`list` of links
            or a single link.
        score: Value written to the ``timestamp`` column of the affected
            rows. Defaults to the current local time when :data:`None`.
        nx: Only create new elements and not to
            update scores for elements that already exist.
        xx: Only update scores of elements that
            already exist. New elements will not be added.

    Note:
        ``nx`` is checked before ``xx``, so it wins when both are set.
        When neither is set, rows are written with ``replace``.
        Nothing is done when ``entries`` is empty.

    """
    if not entries:
        return
    if score is None:
        score = datetime.datetime.now()

    if not single:
        if nx:
            with database.atomic():
                insert_many = [
                    dict(
                        text=link.url,
                        hash=link.name,
                        link=link,
                        timestamp=score,
                    ) for link in entries
                ]
                # Send one chunk per statement. Passing the whole list here
                # would re-send every row once per chunk and bypass the
                # BULK_SIZE limit the chunking is meant to enforce.
                for batch in peewee.chunked(insert_many, BULK_SIZE):
                    (SeleniumQueueModel.insert_many(
                        batch).on_conflict_ignore().execute())
            return

        if xx:
            # Existing rows are matched on their ``text`` column (the URL).
            entries_text = [link.url for link in entries]
            (SeleniumQueueModel.update(timestamp=score).where(
                SeleniumQueueModel.text.in_(entries_text)).execute())
            return

        with database.atomic():
            replace_many = [
                dict(text=link.url, hash=link.name, link=link, timestamp=score)
                for link in entries
            ]
            for batch in peewee.chunked(replace_many, BULK_SIZE):
                SeleniumQueueModel.replace_many(batch).execute()
        return

    # From here on ``entries`` is a single link object.
    if nx:
        SeleniumQueueModel.get_or_create(text=entries.url,
                                         defaults=dict(
                                             hash=entries.name,
                                             link=entries,
                                             timestamp=score,
                                         ))
        return

    if xx:
        # A missing row is not an error in update-only mode.
        with contextlib.suppress(peewee.DoesNotExist):
            model = SeleniumQueueModel.get(
                SeleniumQueueModel.text == entries.url)
            model.timestamp = score
            model.save()
        return

    SeleniumQueueModel.replace(text=entries.url,
                               hash=entries.name,
                               link=entries,
                               timestamp=score).execute()
Пример #4
0
def _save_requests_db(entries: typing.Union[Link, typing.List[Link]], single: bool = False,
                      score: typing.Optional[float] = None, nx: bool = False, xx: bool = False) -> None:
    """Save link to the :mod:`requests` database.

    The function updates the :class:`~darc.model.tasks.requests.RequestsQueueModel` table.

    Args:
        entries: Links to be added to the :mod:`requests` database.
            It can be either a :obj:`list` of links, or a single
            link (if ``single`` set as :data:`True`).
        single: Indicate if ``entries`` is a :obj:`list` of links
            or a single link.
        score: Value written to the ``timestamp`` column of the affected
            rows. Defaults to the current local time when :data:`None`.
        nx: Only create new elements and not to
            update scores for elements that already exist.
        xx: Only update scores of elements that
            already exist. New elements will not be added.

    Note:
        ``nx`` is checked before ``xx``, so it wins when both are set.
        When neither is set, rows are written with ``replace``.
        Nothing is done when ``entries`` is empty.

    """
    if not entries:
        return
    if score is None:
        score = datetime.datetime.now()  # type: ignore

    if not single:
        if typing.TYPE_CHECKING:
            entries = typing.cast(typing.List[Link], entries)

        if nx:
            with database.atomic():
                insert_many = [dict(
                    text=link.url,
                    hash=link.name,
                    link=link,
                    timestamp=score,
                ) for link in entries]
                # Send one chunk per statement. Passing the whole list here
                # would re-send every row once per chunk and bypass the
                # BULK_SIZE limit the chunking is meant to enforce.
                for batch in peewee.chunked(insert_many, BULK_SIZE):
                    _db_operation(RequestsQueueModel
                                  .insert_many(batch)
                                  .on_conflict_ignore()
                                  .execute)
            return

        if xx:
            # Existing rows are matched on their ``text`` column (the URL).
            entries_text = [link.url for link in entries]
            _db_operation(RequestsQueueModel
                          .update(timestamp=score)
                          .where(typing.cast(peewee.TextField, RequestsQueueModel.text).in_(entries_text))
                          .execute)
            return

        with database.atomic():
            replace_many = [dict(
                text=link.url,
                hash=link.name,
                link=link,
                timestamp=score
            ) for link in entries]
            for batch in peewee.chunked(replace_many, BULK_SIZE):
                _db_operation(RequestsQueueModel.replace_many(batch).execute)
        return

    # From here on ``entries`` is a single link object.
    if typing.TYPE_CHECKING:
        entries = typing.cast(Link, entries)

    if nx:
        _db_operation(RequestsQueueModel.get_or_create,
                      text=entries.url,
                      defaults=dict(
                          hash=entries.name,
                          link=entries,
                          timestamp=score,
                      ))
        return

    if xx:
        # A missing row is not an error in update-only mode.
        with contextlib.suppress(peewee.DoesNotExist):
            model = _db_operation(RequestsQueueModel.get, RequestsQueueModel.text == entries.url)
            model.timestamp = score
            _db_operation(model.save)
        return

    _db_operation(RequestsQueueModel.replace(
        text=entries.url,
        hash=entries.name,
        link=entries,
        timestamp=score
    ).execute)
Пример #5
0
def _save_selenium_db(entries: 'Union[Link, List[Link]]',
                      single: bool = False,
                      score: 'Optional[float]' = None,
                      nx: bool = False,
                      xx: bool = False) -> None:
    """Save link to the :mod:`selenium` database.

    The function updates the :class:`~darc.model.tasks.selenium.SeleniumQueueModel` table.

    Args:
        entries: Links to be added to the :mod:`selenium` database.
            It can be either a :obj:`list` of links, or a single
            link (if ``single`` set as :data:`True`).
        single: Indicate if ``entries`` is a :obj:`list` of links
            or a single link.
        score: POSIX timestamp converted with ``datetime.fromtimestamp``
            and written to the ``timestamp`` column of the affected rows.
            Defaults to the current local time when :data:`None`.
        nx: Only create new elements and not to
            update scores for elements that already exist.
        xx: Only update scores of elements that
            already exist. New elements will not be added.

    Note:
        ``nx`` is checked before ``xx``, so it wins when both are set.
        When neither is set, rows are written with ``replace``.
        Nothing is done when ``entries`` is empty.

    """
    if not entries:
        return None
    if score is None:
        timestamp = datetime.now()
    else:
        timestamp = datetime.fromtimestamp(score)

    if not single:
        if TYPE_CHECKING:
            entries = cast('List[Link]', entries)

        if nx:
            with database.atomic():
                insert_many = [{
                    'text': link.url,
                    'hash': link.name,
                    'link': link,
                    'timestamp': timestamp,
                } for link in entries]
                # Send one chunk per statement. Passing the whole list here
                # would re-send every row once per chunk and bypass the
                # BULK_SIZE limit the chunking is meant to enforce.
                for batch in peewee.chunked(insert_many, BULK_SIZE):
                    _db_operation(
                        SeleniumQueueModel.insert_many(
                            batch).on_conflict_ignore().execute)
            return None

        if xx:
            # Existing rows are matched on their ``text`` column (the URL).
            entries_text = [link.url for link in entries]
            _db_operation(
                SeleniumQueueModel.update(timestamp=timestamp).where(
                    cast('TextField',
                         SeleniumQueueModel.text).in_(entries_text)).execute)
            return None

        with database.atomic():
            replace_many = [{
                'text': link.url,
                'hash': link.name,
                'link': link,
                'timestamp': timestamp,
            } for link in entries]
            for batch in peewee.chunked(replace_many, BULK_SIZE):
                _db_operation(SeleniumQueueModel.replace_many(batch).execute)
        return None

    # From here on ``entries`` is a single link object.
    if TYPE_CHECKING:
        entries = cast('Link', entries)

    if nx:
        _db_operation(SeleniumQueueModel.get_or_create,
                      text=entries.url,
                      defaults={
                          'hash': entries.name,
                          'link': entries,
                          'timestamp': timestamp,
                      })
        return None

    if xx:
        # A missing row is not an error in update-only mode.
        with contextlib.suppress(peewee.DoesNotExist):
            model = _db_operation(SeleniumQueueModel.get, SeleniumQueueModel.text == entries.url)  # type: SeleniumQueueModel # pylint: disable=line-too-long
            model.timestamp = timestamp
            _db_operation(model.save)
        return None

    _db_operation(
        SeleniumQueueModel.replace(
            text=entries.url,
            hash=entries.name,
            link=entries,
            timestamp=timestamp,
        ).execute)
    return None