# Example #1
def run(total_hours=24, hourly_limit=300, shuffle=False, termlist=None):
    """Query Google and Baidu image search for every term in *termlist*,
    pacing requests evenly so the run spreads over *total_hours* without
    exceeding *hourly_limit* requests per hour.

    Args:
        total_hours: hours the scrape is allowed to run.
        hourly_limit: maximum search requests per hour.
        shuffle: if True, query the terms in random order.
        termlist: DataFrame-like object with ``english``, ``chinese`` and
            (optionally) ``label`` columns; loaded via
            ``space.load_termlist()`` when None.

    Returns:
        Tuple ``(google_img_count, baidu_img_count, total_requests)``.
    """
    import os
    import random
    import time

    if termlist is None:
        termlist = space.load_termlist()

    # Cap the request count by both the time budget and the number of
    # available terms, then spread the requests evenly across the budget.
    total_requests = min(int(total_hours * hourly_limit), len(termlist))
    total_time = 60 * 60 * min(total_hours, len(termlist) / hourly_limit)
    wait_time = total_time / total_requests
    daily_max_requests = hourly_limit * 24

    # Best effort: the results directory may already exist.
    try:
        os.mkdir('search_results')
    except OSError:
        pass

    term_indices = list(termlist.index)
    if shuffle:
        print("shuffling termlist")
        random.shuffle(term_indices)
    if len(termlist) > daily_max_requests:
        print("Warning: termlist length is", len(termlist),
              "while max daily requests will be", daily_max_requests)
    if len(termlist) > total_requests:
        print(
            f"Warning: only querying {total_requests} of {len(termlist)} total terms (not enough time specified)"
        )
    space.write_logs(
        f"querying {total_requests} terms for a minimum of {printable_time(seconds=total_time)}",
        verbose=True)

    google_img_count = 0
    baidu_img_count = 0
    google_fails = []
    baidu_fails = []
    results = ResultSetList()

    start_ts = time.time()
    for i in range(total_requests):
        start_iter_ts = time.time()
        # Initialize so the except branch can't hit a NameError when pop()
        # fails on the very first iteration.
        term_idx = None
        try:
            term_idx = term_indices.pop()
            english_term = termlist.loc[term_idx].english
            chinese_term = termlist.loc[term_idx].chinese
        except Exception as e:
            print("out of terms", term_idx, str(e))
            break
        try:
            label = termlist.loc[term_idx].label
        except Exception:
            # Termlists without a label column get a default label.
            label = "automated_scraper"
        result = ResultSet(english_term, chinese_term, label)
        print(
            f'request {i}, term idx {term_idx}: "{result.combined_term()}", (label: {label})'
        )
        if not english_term:
            print("\tskipping Google for term (English term not present)")
        else:
            try:
                urls = query_google(english_term)
                result.add(urls[:MAX_PICTURES_PER], GOOGLE)
            except Exception as e:
                google_fails.append(e)
                print("\tGoogle fail")
        if not chinese_term:
            print("\tskipping Baidu for term (Chinese term not present)")
        else:
            try:
                urls = query_baidu(chinese_term)
                result.add(urls[:MAX_PICTURES_PER], BAIDU)
            except Exception as e:
                baidu_fails.append(e)
                print("\tBaidu fail")
        results.add(result)

        # Account for the time the search calls took, plus jitter in
        # [-1, 1) seconds so requests don't land on an exact cadence.
        took = time.time() - start_iter_ts
        time_noise = random.random() * 2 - 1

        # Periodically flush results as a backup (every 25 requests); this
        # is best-effort, not a reliable data store.
        if i % 25 == 24:
            try:
                update_results(results)
                # Read the write counters BEFORE clearing — the original
                # cleared first, which adds zero if clear() resets `wrote`.
                google_img_count += results.wrote[GOOGLE]
                baidu_img_count += results.wrote[BAIDU]
                results.clear()
            except Exception as e:
                import traceback
                print(
                    "failed to write search results; waiting until next attempt:",
                    e)
                print(traceback.format_exc())
        time.sleep(max(0, wait_time - took + time_noise))

    # Final flush of anything still pending; one retry after a minute.
    if results.length > 0:
        try:
            update_results(results)
            google_img_count += results.wrote[GOOGLE]
            baidu_img_count += results.wrote[BAIDU]
            results.clear()
        except Exception:
            import traceback
            print(traceback.format_exc())
            print("Failed to update search results, waiting 1 minute")
            time.sleep(60)
            update_results(results)
            google_img_count += results.wrote[GOOGLE]
            baidu_img_count += results.wrote[BAIDU]
            results.clear()

    # Report the accumulated totals. The original read results.wrote here,
    # after the result set had been cleared, and mixed the "google" string
    # literal with the BAIDU constant.
    space.write_logs(
        f'wrote {google_img_count} google images and {baidu_img_count} baidu images',
        verbose=True)
    if baidu_fails or google_fails:
        space.write_error(
            f"Baidu failures: {len(baidu_fails)}, Google failures: {len(google_fails)}"
        )
    print("took", printable_time(seconds=time.time() - start_ts))
    return (google_img_count, baidu_img_count, total_requests)


def save_search_results(results):
    '''
    Given a set of searches that the scraper has created, post each individually to
    the /createSearch endpoint. spaces_interface.write_search_results should have
    created a new list in each result object of the Digital Ocean URLs

    NOTE(review): in the original file this body sat unreachable after the
    `return` in run() with its `def` line missing; restored here so the
    __main__ call to save_search_results() resolves.
    '''
    search_term_to_id = {}
    print(f"saving {results.length} search terms")
    for term, result in results.iterterm():
        post_result = post_search(result, '192.168.0.1')
        if not post_result:
            raise Exception("failed to post result for term " + term)
        search_id = post_result["search_id"]
        for engine in (GOOGLE, BAIDU):
            datalake_urls = result.get_datalake_urls(engine)
            if len(result.urls[engine]) != len(datalake_urls):
                # Counts disagree, so the original URLs can't be paired with
                # their datalake copies; post the datalake URLs alone.
                post_images(search_id, engine, datalake_urls)
            else:
                post_images(search_id, engine, datalake_urls,
                            result.urls[engine])
        search_term_to_id[result.combined_term()] = search_id
    return search_term_to_id

if __name__ == "__main__":
    from results import ResultSet, ResultSetList
    import time
    result = ResultSet('bunny', '')
    result.add(['google.com', 'bunnies.io'], GOOGLE)
    result.set_datalake_urls(['datalake.com/google.com', 'datalake.com/bunnies.io'], GOOGLE)
    results = ResultSetList()
    results.add(result)
    save_search_results(results)