Example #1
def simulate_short_term_fetch(
        url_frontier_list: pyd.Frontier) -> List[pyd.Url]:
    # Fall back to the default crawl delay when the FQDN does not define one.
    crawl_delay = (local.load_setting("default_crawl_delay")
                   if url_frontier_list.fqdn_crawl_delay is None else
                   url_frontier_list.fqdn_crawl_delay)
    # Scale the delay down so the simulation runs faster than a real crawl.
    simulated_crawl_delay = crawl_delay / local.load_setting(
        "crawling_speed_factor")

    session = requests.Session()
    cumulative_parsed_list = []

    # Visit every URL in the short-term frontier, pausing for the scaled
    # crawl delay between requests to the same FQDN.
    for url in url_frontier_list.url_list:
        cumulative_parsed_list.extend(simulate_parse_url(url, session))
        sleep(simulated_crawl_delay)

    logger.info("Short Term Frontier processed. FQDN {}, URLs {}".format(
        url_frontier_list.fqdn, url_frontier_list.fqdn_url_count))
    return cumulative_parsed_list
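The delay scaling above is easiest to see with concrete numbers. A minimal sketch, assuming illustrative values for the default_crawl_delay and crawling_speed_factor settings (in the real code both come from local.load_setting):

# Hypothetical setting values, purely to illustrate the scaling in Example #1.
default_crawl_delay = 2.0       # seconds a polite crawler waits per FQDN
crawling_speed_factor = 10.0    # how much faster the simulation should run

simulated_crawl_delay = default_crawl_delay / crawling_speed_factor
print(simulated_crawl_delay)    # 0.2 -> the fetch loop sleeps 0.2 s per URL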
Example #2
def simulate_parse_url(url: pyd.Url,
                       session: requests.Session) -> List[pyd.Url]:
    # Mark the URL as visited; it is returned together with the links
    # "discovered" on the simulated page.
    url.url_last_visited = datetime.now()

    parsed_list = [url]

    simulated_link_amount = random.randint(
        local.load_setting("min_links_per_page"),
        local.load_setting("max_links_per_page"),
    )

    logger.debug("{} Next URL: {}".format(mp.current_process(), url.url))

    for _ in range(simulated_link_amount):
        # Two draws decide whether the link stays on this FQDN or leaves it,
        # and whether it points to a new or an already known URL.
        internal_external_rand = random.random()
        known_unknown_rand = random.random()

        if new_internal_cond(internal_external_rand, known_unknown_rand):
            new_url = generate_new_internal_url(url)
            logger.debug("{} New Internal URL: {}".format(
                mp.current_process(), new_url.url))
            parsed_list.append(new_url)

        if existing_internal_cond(internal_external_rand, known_unknown_rand):
            new_url = generate_existing_internal_url(url=url)
            logger.debug("{} Existing Internal URL: {}".format(
                mp.current_process(), new_url.url))
            parsed_list.append(new_url)

        if new_external_cond(internal_external_rand, known_unknown_rand):
            new_url = gen.generate_random_url()
            logger.debug("{} New External URL: {}".format(
                mp.current_process(), new_url.url))
            parsed_list.append(new_url)

        if existing_external_cond(internal_external_rand, known_unknown_rand):
            new_url = generate_existing_external_url(session=session)
            logger.debug("{} Existing External URL: {}".format(
                mp.current_process(), new_url.url))
            parsed_list.append(new_url)

    return parsed_list
Example #3
def simulate_full_fetch(
        long_term_frontier: pyd.FrontierResponse) -> pyd.SimulatedParsedList:

    logger.debug("Long Term Frontier: {}".format(long_term_frontier))

    # First fan-out: parse every FQDN's long-term frontier in parallel.
    fqdn_pool = mp.Pool(processes=local.load_setting("parallel_process"))
    url_frontier_list = fqdn_pool.map(simulate_fqdn_parse,
                                      long_term_frontier.url_frontiers)
    fqdn_pool.close()

    logger.debug("URL Frontier List: {}".format(url_frontier_list))

    # Second fan-out: fetch each short-term frontier in parallel.
    url_pool = mp.Pool(processes=local.load_setting("parallel_process"))
    url_data = url_pool.map(simulate_short_term_fetch, url_frontier_list)
    url_pool.close()

    # Flatten the per-frontier result lists into a single list of URLs.
    flat_url_data = [url for url_list in url_data for url in url_list]

    logger.debug("Url Data: {}".format(flat_url_data))

    # Merge FQDNs discovered during the fetch into the existing frontier list.
    all_new_fqdns = fqdns_from_url_list(flat_url_data)
    extended_url_frontier_list = unique_fqdn_list(url_frontier_list,
                                                  all_new_fqdns)

    processed_frontier = pyd.FrontierResponse(
        uuid=long_term_frontier.uuid,
        response_url=long_term_frontier.response_url,
        latest_return=long_term_frontier.latest_return,
        url_frontiers_count=long_term_frontier.url_frontiers_count,
        urls_count=long_term_frontier.urls_count,
        url_frontiers=extended_url_frontier_list,
    )

    return pyd.SimulatedParsedList(
        uuid=long_term_frontier.uuid,
        fqdn_count=len(processed_frontier.url_frontiers),
        fqdns=processed_frontier.url_frontiers,
        url_count=len(flat_url_data),
        urls=flat_url_data,
    )
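Example #3 fans the work out over two consecutive process pools. A minimal sketch of the same fan-out pattern using context-managed pools; the worker function is a stand-in, not part of the original code:

import multiprocessing as mp


def _square(x):
    # Stand-in worker; like simulate_fqdn_parse, it must be a top-level,
    # picklable callable so the pool processes can import it.
    return x * x


if __name__ == "__main__":
    # "with" closes and joins the pool even if a worker raises, which the
    # bare close() calls in Example #3 do not guarantee.
    with mp.Pool(processes=4) as pool:
        results = pool.map(_square, [1, 2, 3, 4])
    print(results)  # [1, 4, 9, 16]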
Example #4
def existing_external_cond(internal_vs_external_randomness,
                           known_vs_unknown_randomness):
    # True only when both draws land on the "external" and "existing" side
    # of their respective thresholds.
    return (internal_vs_external_randomness
            >= local.load_setting("internal_vs_external_threshold")
            and known_vs_unknown_randomness
            >= local.load_setting("new_vs_existing_threshold"))
Example #5
def main():
    i = 0
    websch.init_fetcher_settings()

    uuid = websch.init_fetcher()
    ec2_instance_id = websch.get_instance_id()

    os.makedirs(s.log_dir, exist_ok=True)

    logger = logging.getLogger("FETSIM")
    logger.setLevel(local.load_setting("logging_mode"))

    fh = logging.FileHandler("{}/{}.log".format(s.log_dir, ec2_instance_id))
    fh.setLevel(local.load_setting("logging_mode"))

    ch = logging.StreamHandler()
    ch.setLevel(local.load_setting("logging_mode"))

    formatter = logging.Formatter(
        "%(asctime)s %(name)s %(levelname)s %(message)s")
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)

    logger.addHandler(fh)
    logger.addHandler(ch)

    logger.info("Fetcher Settings: {}".format(local.load_all_settings()))

    # Run the configured number of fetch-simulation iterations.
    while i < local.load_setting("iterations"):
        times = {"begin": time.time()}
        frontier_response = websch.get_frontier_partition(uuid)
        times["frontier_loaded"] = time.time()

        logger.info("Frontier Stats: {} FQDNs, {} URLs".format(
            frontier_response.url_frontiers_count,
            sum(url_frontier.fqdn_url_count
                for url_frontier in frontier_response.url_frontiers),
        ))
        for url_frontier in frontier_response.url_frontiers:
            logger.debug("Frontier {} URL Amount: {}".format(
                url_frontier.fqdn, url_frontier.fqdn_url_count))

        times["fetch_begin"] = time.time()
        cpu_time_before = time.process_time()
        simulated_urls = fetch.simulate_full_fetch(frontier_response)
        times["fetch_finished"] = time.time()
        cpu_time = time.process_time() - cpu_time_before

        logger.info("Response Stats: {} FQDNs, {} URLs".format(
            simulated_urls.fqdn_count, simulated_urls.url_count))

        times["submission_begin"] = time.time()
        datsav.submit_processed_list(
            submission_endpoint=frontier_response.response_url,
            submit_list=simulated_urls,
        )
        times["submission_finished"] = time.time()

        logger.info(
            "Iteration Stats: "
            "iter_load_duration: {} s, "
            "iter_fetch_start: {}, "
            "iter_fetch_duration: {} s, "
            "iter_fetch_cpu_time: {} s, "
            "iter_submit_duration: {} s".format(
                round((times["frontier_loaded"] - times["begin"]), 3),
                time.strftime(
                    "%Y-%m-%d %H:%M:%S.{:03d}000".format(
                        int(times["fetch_begin"] * 1000) % 1000),
                    time.gmtime(times["fetch_begin"]),
                ),
                round((times["fetch_finished"] - times["fetch_begin"]), 3),
                round(cpu_time, 3),
                round(
                    (times["submission_finished"] - times["submission_begin"]),
                    3),
            ))

        time.sleep(5)
        db_stats = websch.get_db_stats()
        logger.info("DB Stats: "
                    "db_frontier_amount: {}, "
                    "db_url_amount: {}, "
                    "db_avg_freshness: {}, "
                    "db_visited_ratio: {}, "
                    "db_fqdn_hash_range: {} %".format(
                        db_stats["frontier_amount"],
                        db_stats["url_amount"],
                        db_stats["avg_freshness"],
                        db_stats["visited_ratio"],
                        db_stats["fqdn_hash_range"],
                    ))

        i += 1

    s3_upload.upload()
    logger.info("Terminating Program")