Пример #1
0
def agent_wrapper(url, scope):
    """Fetch ``url`` once and return the addresses and e-mails found on it.

    Args:
        url: URL to crawl; ``break_url`` splits it into port, host and URI.
        scope: Value stored on the agent's ``scope`` attribute before the
            request is made.

    Returns:
        A ``(addresses, emails)`` pair holding the results of
        ``get_addresses(page)`` and ``get_emails(page)``. A pair of empty
        sets is returned when ``url`` is already in ``visited`` or when any
        step raises.
    """
    import logging

    try:
        # Already-fetched URLs are skipped without touching the network.
        if url in visited:
            return set(), set()
        port, host, uri = break_url(url)
        agent = CAgent(host, port)
        agent.uri = uri
        agent.scope = scope
        page = agent.request()
        # Recorded only after the request succeeded, so a URL whose request
        # failed is not marked as visited.
        visited.add(url)
        addresses = get_addresses(page)
        emails = get_emails(page)
        return addresses, emails
    except Exception:
        # Deliberate best-effort: one bad URL must not abort the crawl, but
        # leave a trace so failures can be diagnosed when debugging is on.
        logging.getLogger(__name__).debug(
            "agent_wrapper failed for %r", url, exc_info=True)
        return set(), set()
Пример #2
0
def disgusting_image_grab(l):
    """Download the JPEG behind URL ``l`` and store it under ``/temp_vol/``.

    The URL is first requested with ``CAgent.request()``. Only when that call
    raises ``UnicodeDecodeError`` is the resource fetched again with
    ``request_image()`` and the bytes following the JPEG ``Content-Type``
    header written to ``/temp_vol/<hash>.jpg``.

    Args:
        l: Image URL as scraped from HTML; surrounding double quotes and
            ``&amp;`` entities are tolerated.

    Raises:
        IndexError: The raw response contains neither the ``image/jpg`` nor
            the ``image/jpeg`` header terminator.
    """
    # URLs lifted out of HTML attributes may still carry their quotes and
    # HTML-escaped ampersands; undo both before parsing the URL.
    link = l.strip('"').replace('&amp;', '&')
    port, host, uri = break_url(link)
    temp_agent = CAgent(host, port)
    temp_agent.uri = uri
    try:
        temp_agent.request()
    except UnicodeDecodeError:
        # The response could not be decoded as text; fetch it as raw bytes.
        resp = temp_agent.request_image()
    else:
        # The response decoded cleanly, so there are no image bytes to save;
        # skip writing an empty .jpg file.
        return

    markers = (
        b'Content-Type: image/jpg\r\n\r\n',
        b'Content-Type: image/jpeg\r\n\r\n',
    )
    for marker in markers:
        # partition keeps everything after the first marker, so image data
        # that happens to contain the marker bytes again is not truncated.
        _, found, image = resp.partition(marker)
        if found:
            break
    else:
        raise IndexError(
            "no JPEG Content-Type header in response for %r" % link)

    # NOTE(review): hash() of bytes is salted per interpreter process, so the
    # file name for the same image differs between runs.
    name = "/temp_vol/" + str(hash(image)) + ".jpg"
    with open(name, "wb") as file:
        file.write(image)
Пример #3
0
def act1_step2():
    """Fetch the RIT computing directory page and grab its ``data-src`` URLs.

    Requests one hard-coded directory URL through a ``CAgent``, records it in
    ``visited``, extracts every ``data-src="http(s)://..."`` URL from the page
    and submits one ``disgusting_image_grab`` call per URL to a thread pool.
    """
    url = "https://www.rit.edu/computing/directory?term_node_tid_depth=4919"
    port, host, uri = break_url(url)
    agent = CAgent(host, port)
    agent.uri = uri
    page = agent.request()
    visited.add(url)
    # Group 1 is the whole URL; group 2 is only the last "word characters plus
    # one character" repetition. The "." is unescaped, so it matches any
    # character and the capture can run one character past the URL (e.g. onto
    # the closing quote).
    # NOTE(review): the pattern is not a raw string, so "\w" is an invalid
    # string escape that newer Python versions warn about.
    pattern = re.compile('(?:data-src=)"(https?://(\w+.)+)')
    # With two capture groups, findall yields (group 1, group 2) tuples.
    temp_links = pattern.findall(page)
    real_links = list()
    for i in temp_links:
        # Keep only the full URL (group 1).
        real_links.append(i[0])
    # One worker per link, so every grab can be in flight at the same time.
    # NOTE(review): ThreadPoolExecutor raises ValueError when max_workers is 0,
    # i.e. when the page yields no links - confirm that case cannot occur.
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=len(real_links)) as executor:
        # Start the load operations and mark each future with its URL
        future_images = {
            executor.submit(disgusting_image_grab, i): i
            for i in real_links
        }