Пример #1
0
def regularsearch(area, category, sort, cache, cachedir, executor, get,
                  **kwargs):
    """Yield posts from an HTML search, walking the result pages in order.

    The first page (offset 0) is downloaded to learn the total number of
    posts; further pages are then requested at offsets 100, 200, ... up to
    (but not including) that total.

    Args:
        area: Passed to ``get_query_url`` and ``get_posts_from_response``.
        category: Passed to ``get_query_url``.
        sort: Forwarded to ``get_query_url`` as the ``sort`` keyword.
        cache: Not used by this function.
        cachedir: Not used by this function.
        executor: Not used by this function.
        get: Callable that takes a URL and returns content accepted by
            ``lxml.html.fromstring``.
        **kwargs: Extra keyword arguments forwarded to ``get_query_url``.

    Yields:
        Whatever ``get_posts_from_response`` produces for each page.
    """
    page_step = 100

    def load_page(start):
        # Build the query URL, download it and parse the markup.
        url = get_query_url(area,
                            category,
                            'search',
                            offset=start,
                            sort=sort,
                            **kwargs)
        return lxml.html.fromstring(get(url))

    first_page = load_page(0)
    total_posts = get_num_total_posts_from_response(first_page)
    # The per-page count is parsed but its value is not used afterwards.
    get_number_of_posts_on_current_page_from_response(first_page)
    yield from get_posts_from_response(first_page, area)

    for start in range(page_step, total_posts, page_step):
        page = load_page(start)
        get_number_of_posts_on_current_page_from_response(page)
        yield from get_posts_from_response(page, area)
Пример #2
0
def regularsearch(area, category, sort, cache, cachedir, executor, get,
                  **kwargs):
    """Yield posts from an HTML search, walking the result pages in order.

    The first page (offset 0) is downloaded to learn both the total number
    of posts and how many posts a page holds; that page size is then used
    as the offset step for the remaining pages.

    Args:
        area: Passed to ``get_query_url`` and ``get_posts_from_response``.
        category: Passed to ``get_query_url``.
        sort: Forwarded to ``get_query_url`` as the ``sort`` keyword.
        cache: Not used by this function.
        cachedir: Not used by this function.
        executor: Not used by this function.
        get: Callable that takes a URL and returns content accepted by
            ``lxml.html.fromstring``.
        **kwargs: Extra keyword arguments forwarded to ``get_query_url``.

    Yields:
        Whatever ``get_posts_from_response`` produces for each page.
    """
    def fetch_page(offset):
        # Build the query URL, download it and parse the markup.
        return lxml.html.fromstring(
            get(
                get_query_url(area,
                              category,
                              'search',
                              offset=offset,
                              sort=sort,
                              **kwargs)))

    doc = fetch_page(0)
    num_total_posts = get_num_total_posts_from_response(doc)
    num_posts_on_page = get_number_of_posts_on_current_page_from_response(doc)
    logger.debug(
        f'downloaded first page: num_total_posts: {num_total_posts} | num_posts_on_page: {num_posts_on_page}'
    )
    yield from get_posts_from_response(doc, area)

    per_page = num_posts_on_page
    if per_page <= 0:
        # An empty first page gives a zero step, and range() raises
        # ValueError on a zero step; there is nothing further to paginate.
        return

    for offset in range(per_page, num_total_posts, per_page):
        doc = fetch_page(offset)
        # Still parsed on every page, as before; the value itself is unused.
        get_number_of_posts_on_current_page_from_response(doc)
        yield from get_posts_from_response(doc, area)
Пример #3
0
async def jsonsearch_async(area,
                           category,
                           sort,
                           cache,
                           cachedir,
                           get,
                           as_completed=asyncio.as_completed,
                           **kwargs):
    """Asynchronously yield posts from a ``jsonsearch`` query.

    The initial query URL is processed first; every cluster it returns is
    then fetched concurrently, and clusters returned by those fetches are
    processed recursively in the same way.

    Args:
        area: Passed to ``get_query_url``.
        category: Passed to ``get_query_url``.
        sort: Forwarded to ``get_query_url`` as the ``sort`` keyword.
        cache: Not used by this function.
        cachedir: Not used by this function.
        get: Passed through to ``process_cluster_url_async``.
        as_completed: Callable that takes the list of pending tasks and
            returns an iterable of awaitables; defaults to
            ``asyncio.as_completed``.
        **kwargs: Extra keyword arguments forwarded to ``get_query_url``.

    Yields:
        Each post returned by ``process_cluster_url_async``.
    """
    async def process_clusters(clusters):
        # Wrap each coroutine in a task: bare coroutine objects have no
        # ``cancel()`` method, so the interrupt handler below could not
        # cancel them otherwise.
        futures = [
            asyncio.ensure_future(process_cluster_url_async(cluster.url, get))
            for cluster in clusters
        ]
        try:
            for future in as_completed(futures):
                posts, nested_clusters = await future
                for post in posts:
                    yield post
                # Recurse into the clusters discovered by this request.
                async for post in process_clusters(nested_clusters):
                    yield post
        except KeyboardInterrupt:  # pragma: no cover
            for future in futures:
                future.cancel()

    url = get_query_url(area,
                        category,
                        "jsonsearch",
                        sort=sort,
                        map=1,
                        **kwargs)
    posts, clusters = await process_cluster_url_async(url, get)

    for post in posts:
        yield post
    async for post in process_clusters(clusters):
        yield post
Пример #4
0
def jsonsearch(area,
               category,
               sort,
               cache,
               cachedir,
               executor,
               get,
               as_completed=concurrent.futures.as_completed,
               **kwargs):
    """Yield posts from a ``jsonsearch`` query.

    The initial query URL is processed first; every cluster it returns is
    then submitted to ``executor``, and clusters returned by those requests
    are processed recursively in the same way.

    Args:
        area: Passed to ``get_query_url``.
        category: Passed to ``get_query_url``.
        sort: Forwarded to ``get_query_url`` as the ``sort`` keyword.
        cache: Not used by this function.
        cachedir: Not used by this function.
        executor: Object whose ``submit`` method schedules
            ``process_cluster_url`` calls and returns futures.
        get: Passed through to ``process_cluster_url``.
        as_completed: Callable that takes the list of futures and returns
            an iterable of finished futures; defaults to
            ``concurrent.futures.as_completed``.
        **kwargs: Extra keyword arguments forwarded to ``get_query_url``.

    Yields:
        Each post returned by ``process_cluster_url``.
    """
    def process_clusters(clusters, executor):
        # A list (not a generator expression) so the futures can still be
        # iterated for cancellation after ``as_completed`` has consumed them.
        futures = [
            executor.submit(process_cluster_url, cluster.url, get)
            for cluster in clusters
        ]
        try:
            for future in as_completed(futures):
                posts, nested_clusters = future.result()
                yield from posts
                # ``process_clusters`` is a generator function: it must be
                # delegated to, otherwise nested clusters are never fetched.
                yield from process_clusters(nested_clusters, executor)
        except KeyboardInterrupt:  # pragma: no cover
            for future in futures:
                future.cancel()

    url = get_query_url(area,
                        category,
                        "jsonsearch",
                        sort=sort,
                        map=1,
                        **kwargs)
    posts, clusters = process_cluster_url(url, get)
    yield from posts
    yield from process_clusters(clusters, executor)
Пример #5
0
def test_get_query_url():
    """Check URL building for a valid category and rejection of a bad one."""
    from craigslist._search import get_query_url

    expected_fragment = "https://washingtondc.craigslist.org/jsonsearch/apa"
    built_url = get_query_url('washingtondc', 'apa', 'jsonsearch')
    assert expected_fragment in built_url

    # An unknown category is expected to raise ValueError.
    with pytest.raises(ValueError) as raised:
        get_query_url('washingtondc', 'errorerrorerror', 'jsonsearch')