Exemplo n.º 1
0
def test_simple():
    """Check timings measured by ``timeit`` for rate-limited ``foo``.

    NOTE(review): ``timeit(f, n)`` is presumably "seconds taken by n calls
    of f" and ``ratelimit(a, b)`` a decorator factory -- confirm against
    their definitions, which are not part of this block.
    """

    def fresh_limited():
        # A brand-new limiter wrapper on every call, so the three
        # assertions below do not share any limiter state.
        return ratelimit(3, 0.1)(foo)

    assert timeit(fresh_limited(), 1) < 0.01
    assert timeit(fresh_limited(), 3) < 0.01
    assert timeit(fresh_limited(), 5) > 0.2

    # One wrapper reused for both measurements: the second measurement
    # starts with whatever state the first one left behind.
    shared_limited = ratelimit(1, 0.2)(foo)
    assert timeit(shared_limited, 1) < 0.01
    assert timeit(shared_limited, 2) > 0.2
Exemplo n.º 2
0
Arquivo: truecar.py Projeto: qdbp/cars
def run_scraper(state: TruecarState, args: Namespace) -> None:
    """Scrape listings in adaptive mileage windows, yielding progress.

    NOTE(review): this is a generator function (it yields), so the
    ``-> None`` annotation is inaccurate. It is left untouched here because
    the typing import needed for a generator annotation is not visible from
    this block.

    The range from the starting mileage up to ``mileage_cap`` is walked in
    windows. After each window the width is rescaled so that, at the density
    just observed, the next window would hold about ``target_total``
    listings. Each window's listings are handed to a single-worker executor
    for insertion.

    Args:
        state: Progress record. Reads ``scrape_started_unix``,
            ``scrape_finished_unix`` and ``start_mileage``; writes
            ``scrape_started_unix``, ``start_mileage`` and
            ``scrape_finished_unix``.
        args: Parsed options; only ``args.ratelimit`` is read, and it is
            passed as the second argument of ``ratelimit``.

    Yields:
        ``state`` after every window and once more after the finish time
        has been recorded, or ``state.new()`` (followed by termination)
        when a window is empty while ``state.start_mileage`` is already at
        the cap.
    """

    LOG.info(f"Starting scrape with state {state}")

    # A finish time older than the start time means the last recorded run
    # never reached the end, so continue from the stored mileage.
    if state.scrape_finished_unix < state.scrape_started_unix:
        start_mileage = state.start_mileage
    else:
        start_mileage = 1

    mileage_delta = 10
    target_total = 1000
    mileage_cap = 500_000

    # NOTE(review): ``conn`` is not used again in this function; it is only
    # opened and its close() handed to ``register`` (presumably
    # atexit.register -- confirm).
    conn = sql.Connection(CAR_DB)
    register(conn.close)

    state.scrape_started_unix = int(time.time())

    limiter = ratelimit(3, args.ratelimit)
    session = Session()
    insert_executor = ThreadPoolExecutor(max_workers=1)

    # Leaving this block shuts the executor down with wait=True, so every
    # queued insert has completed before the finish time is written below,
    # and the worker thread is not leaked.
    with session, insert_executor:
        while True:
            max_mileage = min(start_mileage + mileage_delta, mileage_cap)
            listings = list(
                get_listings_shard_sqlite(
                    session,
                    limiter,
                    min_mileage=start_mileage,
                    max_mileage=max_mileage,
                ))
            if not listings and state.start_mileage >= mileage_cap:
                yield state.new()
                return

            insert_executor.submit(insert_listings, listings)

            # mileage_delta < 5 is bugged on the server side
            start_mileage += mileage_delta
            # An empty window is scaled as if it held one listing; dividing
            # by len(listings) directly raised ZeroDivisionError here.
            mileage_delta = max(
                5,
                int(mileage_delta * target_total / max(len(listings), 1)))

            # The range printed here is the *next* window: start_mileage
            # and mileage_delta have already been advanced above.
            LOG.info(
                f"Inserted {len(listings)} listings: "
                f"mileage {start_mileage}->{start_mileage + mileage_delta}")

            state.start_mileage = start_mileage
            yield state
            # Deliberately no listings.clear(): the very same list object
            # was handed to the insert worker, which may not have consumed
            # it yet. The name is rebound on the next iteration anyway.

            if max_mileage >= mileage_cap:
                break

    state.scrape_finished_unix = int(time.time())
    yield state
Exemplo n.º 3
0
def test_basic():
    """Check that bad ``ratelimit`` arguments are rejected and a call works."""

    # Each of these argument pairs must make ratelimit raise ValueError.
    rejected_args = ((1, 0), (0, 1))
    for first, second in rejected_args:
        with pytest.raises(ValueError):
            ratelimit(first, second)

    # test we don't crash
    limited_foo = ratelimit(1, 1)(foo)
    limited_foo()
Exemplo n.º 4
0
def test_microfuzz():
    """Smoke-test ``ratelimit`` over small pools with tiny random periods.

    Nothing is asserted; the test passes as long as no call raises.
    """

    pool_sizes = range(1, 4)
    call_counts = range(1, 4)
    repeats = 5

    for pool in pool_sizes:
        for n_calls in call_counts:
            for _attempt in range(repeats):
                # second ratelimit argument drawn from [1e-6, 1e-6 + 1e-3)
                period = 1e-6 + random() * 1e-3
                limited_foo = ratelimit(pool, period)(foo)
                timeit(limited_foo, n_calls)
Exemplo n.º 5
0
def scrape(st: AutotraderState,
           args: Namespace = None) -> Generator[AutotraderState, None, None]:
    """Scrape listings sector by sector over price, yielding progress state.

    A sector is the price range ``[st.cur_min_price, st.cur_min_price +
    delta]``. For each sector the total result count is probed first; the
    sector is then either paged directly, narrowed by halving ``delta``, or
    (once ``delta`` is 0) split by body type. Pages are recorded in
    ``st.cur_shards`` as ``(body_code_or_None, offset)`` tuples and consumed
    from the end of that list, one page per outer-loop iteration.

    Args:
        st: Progress record. Reads and writes ``cur_shards``; reads
            ``cur_min_price``; calls ``next_sector(delta)`` and ``new()``.
        args: Parsed options; only ``force_restart`` is read. When omitted,
            ``force_restart`` is treated as false.

    Yields:
        The current state object: once after a restart, at the top of every
        outer-loop iteration, and after every empty sector that is skipped.
        The generator ends once ``st.cur_min_price`` exceeds 1_000_000.
    """
    # `args or Namespace` used to bind the Namespace *class* here, so the
    # attribute access below raised AttributeError whenever args was omitted.
    if args is None:
        args = Namespace(force_restart=False)

    price_cap = 1_000_000
    page_size = 25
    # the backend refuses offsets of 1000 and above
    max_offset = 1000

    if args.force_restart or st.cur_min_price > price_cap:
        st = st.new()
        yield st

    sess = Session()
    sess.headers.update({
        "User-Agent":
        "Mozilla/5.0 (X11; Linux x86_64; rv:98.0) "
        "Gecko/20100101 Firefox/98.0"
    })

    insert_pool = ThreadPoolExecutor(max_workers=1)
    limiter = ratelimit(3, 1)

    http_get = limiter(partial(sess.get, timeout=30))

    delta = 256
    inserted = processed = 0

    # Leaving this block (normal return, error, or generator close) closes
    # the HTTP session and waits for queued inserts instead of leaking both.
    with sess, insert_pool:
        while True:
            yield st

            # determine a suitable price range that will keep us under
            # offset 1000, which is the backend allowed limit
            while True:
                # NOTE(review): "allLisingType" looks like a typo for
                # "allListingType"; left untouched because it is sent to
                # the backend as-is -- confirm against the API.
                params = dict(
                    allLisingType="USED",
                    sellerTypes="d",
                    searchRadius=0,
                    numRecords=page_size,
                    minPrice=st.cur_min_price,
                    maxPrice=st.cur_min_price + delta,
                )

                if st.cur_shards:
                    # TODO formalize/generalize parameter sector/sharding
                    # logic
                    body, ofs = st.cur_shards[-1]
                    params |= dict(firstRecord=ofs)
                    if body:
                        params |= dict(vehicleStyleCodes=body)

                # without shards this tries to get the whole sector, which
                # lets us know how to shard
                nd = http_get(BASE_URL, params=params).json()
                tot_results = nd["totalResultCount"]
                LOG.debug(
                    f"Got page: firstRecord={params.get('firstRecord', 0)}, "
                    f"{tot_results=}; n_shards={len(st.cur_shards)}")

                if st.cur_shards:
                    # `nd` is the page for the last shard; go process it
                    break

                if 0 < tot_results < max_offset:
                    st.cur_shards = [
                        (None, ofs)
                        for ofs in range(0, tot_results, page_size)
                    ]
                elif tot_results == 0:
                    st.next_sector(delta)
                    yield st
                    # Without this check an empty tail of the price range
                    # was polled past the cap and the generator never ended.
                    if st.cur_min_price > price_cap:
                        return
                elif delta > 0:
                    delta //= 2
                else:
                    LOG.info("Still too many results, sharding by body type.")
                    shards = {}
                    for bt in AT_BODIES:
                        params["vehicleStyleCodes"] = bt
                        shards[bt] = http_get(
                            BASE_URL, params=params).json()["totalResultCount"]
                    st.cur_shards = [(key, ofs)
                                     for key, val in shards.items()
                                     for ofs in range(0, val, page_size)]

            raw_listings = prepare_listing_dict(nd)

            listings = [handle_listing(sess, ld) for ld in raw_listings]
            inserted += sum(it is not None for it in listings)
            processed += len(listings)

            insert_pool.submit(insert_listings, listings)
            st.cur_shards.pop()

            if not st.cur_shards:
                LOG.info(
                    f"Inserted {inserted} listings: "
                    f"price {st.cur_min_price}->{st.cur_min_price + delta}; "
                    f"{inserted/processed if processed > 0 else 1:.3f} accept rate."
                )
                st.next_sector(delta)
                inserted = processed = 0
                # aim for 750 results; keep delta >= 1, because a delta of 0
                # could never grow again (0 * anything == 0)
                delta = max(1, int(delta * 750 / (100 + tot_results)))

            if st.cur_min_price > price_cap:
                return