def agent_wrapper(url, scope):
    """Fetch *url* with a CAgent and extract links and e-mail addresses.

    Returns a ``(links, emails)`` tuple of sets. Already-visited URLs and
    any failure during fetch/parse yield ``(set(), set())`` so one bad page
    cannot abort the crawl.
    """
    try:
        # Skip URLs we have already crawled (original built a one-element
        # set and used issubset; plain membership is equivalent).
        if url in visited:
            return set(), set()
        port, host, uri = break_url(url)
        agent = CAgent(host, port)
        agent.uri = uri
        agent.scope = scope
        page = agent.request()
        visited.add(url)
        return get_addresses(page), get_emails(page)
    except Exception:
        # Best-effort crawler: swallow per-URL failures and move on.
        return set(), set()
def disgusting_image_grab(l):
    """Download the JPEG at URL *l* and save it as /temp_vol/<hash>.jpg.

    CAgent.request() raises UnicodeDecodeError when the response body is
    binary; that signal tells us the URL is an image, so we re-fetch the
    raw bytes and strip the HTTP headers off the body.
    """
    l = l.strip('"')
    # Undo HTML entity escaping in the scraped URL. The original called
    # l.replace('&', '&'), a no-op; '&amp;' -> '&' is the evident intent.
    l = l.replace('&amp;', '&')
    port, host, uri = break_url(l)
    temp_agent = CAgent(host, port)
    temp_agent.uri = uri
    image = b""
    try:
        temp_agent.request()
    except UnicodeDecodeError:
        resp = temp_agent.request_image()
        # Header spelling varies; try image/jpg first, then image/jpeg.
        try:
            image = resp.split(b'Content-Type: image/jpg\r\n\r\n')[1]
        except IndexError:
            image = resp.split(b'Content-Type: image/jpeg\r\n\r\n')[1]
    # Only write when we actually extracted image bytes; the original wrote
    # an empty hash(b"")-named file whenever the response was not an image.
    if image:
        name = "/temp_vol/" + str(hash(image)) + ".jpg"
        with open(name, "wb") as out:
            out.write(image)
def act1_step2():
    """Scrape the RIT computing directory and download every data-src image.

    Fetches the directory page, extracts all data-src image URLs, and
    downloads them concurrently via disgusting_image_grab.
    """
    url = "https://www.rit.edu/computing/directory?term_node_tid_depth=4919"
    port, host, uri = break_url(url)
    agent = CAgent(host, port)
    agent.uri = uri
    page = agent.request()
    visited.add(url)
    # Raw string for the regex. NOTE: the '.' inside (\w+.)+ is deliberately
    # left unescaped, matching the original — it lets the match run past the
    # hostname into the URL path ('.' matches '/' etc.).
    pattern = re.compile(r'(?:data-src=)"(https?://(\w+.)+)')
    # findall returns (full_url, last_group) tuples; keep the full URL.
    real_links = [match[0] for match in pattern.findall(page)]
    # ThreadPoolExecutor raises ValueError for max_workers=0; nothing to do
    # when the page yielded no image links.
    if not real_links:
        return
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=len(real_links)) as executor:
        # Start the load operations and mark each future with its URL
        future_images = {
            executor.submit(disgusting_image_grab, i): i for i in real_links
        }