def save_to_csv(queue: Queue, condition: Condition, finish_event: Event, path: Path):
    fieldnames = (
        "unique_id",
        "filename",
        "title",
        "source",
        "splash_url",
        "sha1",
        "date",
        "file_url",
        "fulltext",
        "creation_date",
        "creator_tool",
        "creator_title",
    )
    buffer = io.StringIO()
    writer = csv.DictWriter(buffer, fieldnames=fieldnames)
    writer.writeheader()
    inserted_values = 0
    while not finish_event.is_set() or not queue.empty():
        with condition:
            while queue.empty():
                condition.wait()
            try:
                augmented_aptnote = queue.get_nowait()
            finally:
                queue.task_done()
        writer.writerow(augmented_aptnote)
        inserted_values += 1
    with open(path, "wt") as f:
        print(buffer.getvalue(), file=f)
    relative_path = path.relative_to(Path.cwd())
    logger.info(
        f"Downloaded, parsed, and saved {inserted_values} document(s) in {relative_path}"
    )
async def save_to_sqlite(
    queue: Queue, condition: Condition, finish_event: Event, path: Path
) -> None:
    async with aiosqlite.connect(path) as db:
        await db_init(db)
        inserted_values = 0
        while not finish_event.is_set() or not queue.empty():
            with condition:
                while queue.empty():
                    condition.wait()
                try:
                    augmented_aptnote = queue.get_nowait()
                except Exception as e:
                    logger.error(e)
                    # nothing was dequeued, so skip this iteration
                    continue
                finally:
                    queue.task_done()
            await insert_values(db, augmented_aptnote)
            await db.commit()
            inserted_values += 1
    relative_path = path.relative_to(Path.cwd())
    logger.info(
        f"Downloaded, parsed, and saved {inserted_values} document(s) in {relative_path}"
    )
def valuation(self):
    my_value = 0
    enemy_value = 0
    queues = list()
    positions = list(filter(lambda player: player != DEAD, self.players))
    saved_nodes = [[content for content in row] for row in self.nodes]
    for cell in positions:
        queue = Queue()
        queue.put_nowait(cell)
        queues.append(queue)
    while not all(map(lambda queue: queue.empty(), queues)):
        for player, queue in enumerate(queues):
            if queue.empty():
                continue
            if self.players[player][0] == -1:
                item_row, item_col = queue.get_nowait()
                self.nodes[item_row][item_col] = FREE
                continue
            symbol = PLAYERS_SYMBOLS[player]
            item_row, item_col = queue.get_nowait()
            for _, neighbor_row, neighbor_col in self.free_neighbors(item_row, item_col):
                self.nodes[neighbor_row][neighbor_col] = symbol
                queue.put_nowait((neighbor_row, neighbor_col))
                if player == self.me:
                    my_value += 1
                else:
                    enemy_value += 1
    self.nodes = saved_nodes
    return (my_value - enemy_value) / (my_value + enemy_value + 1)
async def run(self, connection: OptolinkConnection, command_queue: asyncio.Queue):
    connection.flush()
    while True:
        # poll start bytes (0x05) and discard them
        byte = await connection.read()
        if byte[0] != 0x05:
            # we are not in synchronization phase and received a byte other than
            # the synchronization byte -> just wait for the next byte
            continue
        # when there is at least one command waiting in the queue, start the communication
        if not command_queue.empty():
            connection.write(b"\x01")
            # TODO: start measuring utilization here
            try:
                while True:
                    cmd, fut = await asyncio.wait_for(command_queue.get(), timeout=0.5)
                    connection.write(cmd.get_command_bytes())
                    val = await connection.read(cmd.get_expected_bytes_count())
                    if all(it == 0x05 for it in val):
                        fut.set_exception(Exception("Command failed"))
                        # we must synchronize again
                        break
                    else:
                        if fut.done():
                            print(fut.result())
                            raise Exception("Future was already done")
                        fut.set_result(cmd.handle_result(val))
            except asyncio.TimeoutError:
                continue
async def ffmpeg(task_queue: asyncio.Queue, task_id: asyncio.Queue):
    """Process video conversion tasks from the queue."""
    assert isinstance(FFMPEG, str)
    while not task_queue.empty():
        try:
            task = await task_queue.get()
            process_id = await task_id.get()
            cmd = process_input(task['input'], task['output'], task['rate'],
                                task['fps'], task['res'])
            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            print('=' * 20 + ' process {}: Converting file {} to output file {} '.format(
                process_id, task['input'], task['output']) + '=' * 20)
            proc.communicate()
            ret = proc.returncode
            if ret != 0:
                print('=' * 20 +
                      ' process {}: Failed to convert file {} | return code {} '.format(
                          process_id, task['input'], ret) + '=' * 20)
            else:
                print('=' * 20 + ' process {}: Completed converting file {} '.format(
                    process_id, task['input']) + '=' * 20)
            print(' Done ')
            task_queue.task_done()
            task_id.task_done()
        except queue.Empty:
            print("no task")
            break
class EventManager(object):
    event_queue = None
    _subscribers = None

    def __init__(self):
        self.event_queue = Queue()
        self._subscribers = defaultdict(set)

    def subscribe(self, event_type, subscriber):
        logger.debug('subscribe %s for %s', subscriber, event_type)
        self._subscribers[event_type].add(subscriber)

    def unsubscribe(self, event_type, subscriber):
        try:
            self._subscribers[event_type].remove(subscriber)
        except KeyError:
            # subscriber was not registered for this event type
            pass

    def add_event(self, event):
        logger.debug('put event %s into queue', event)
        self.event_queue.put_nowait(event)

    async def process_events(self):
        while True:
            while not self.event_queue.empty():
                event = await self.event_queue.get()
                logger.debug('processing event %s', event)
                subscribers_list = self._subscribers.get(event['channel'], [])
                for subscriber in subscribers_list:
                    await subscriber.process_event(event)
                if not subscribers_list:
                    logger.debug('no listeners for event %s', event)
            await asyncio.sleep(0.1)
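# A minimal usage sketch for the EventManager above, assuming `Queue` is
# asyncio.Queue, `defaultdict` comes from collections, and `logger` is a
# configured logging.Logger. The PrintSubscriber class and the "news" channel
# name are hypothetical, introduced only for illustration.
import asyncio

class PrintSubscriber:
    async def process_event(self, event):
        print('received:', event)

async def demo_event_manager():
    manager = EventManager()
    manager.subscribe('news', PrintSubscriber())
    manager.add_event({'channel': 'news', 'payload': 'hello'})
    # process_events() loops forever, so run it briefly and then cancel it
    consumer = asyncio.ensure_future(manager.process_events())
    await asyncio.sleep(0.5)
    consumer.cancel()

# asyncio.run(demo_event_manager())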
async def parse(item_links: asyncio.Queue, sess: ClientSession, items: asyncio.Queue, prox: str):
    start = True
    while not item_links.empty() or start:
        start = False
        item = inst['_base'] + await item_links.get()
        for _ in range(inst['_retry']):
            await asyncio.sleep(1.5)
            try:
                async with sess.get(item, headers=inst['_headers'],
                                    proxy=prox, proxy_auth=auth) as resp:
                    txt = await resp.text()
                    page = fs(txt)
                    res = {'url': item}
                    for k, v in inst['fields'].items():
                        val = page.xpath(v['path'])
                        res[k] = v['type'](val) if val else None
                    table = page.xpath(inst['table']['home'])
                    for t in table:
                        for k, v in zip(t.xpath(inst['table']['title']),
                                        t.xpath(inst['table']['value'])):
                            res[k] = v
                    if res['name'] is None:
                        await item_links.put(item[len(inst['_base']):])
                    else:
                        print(datetime.now())
                        await items.put(res)
                    break
            except Exception:
                await item_links.put(item[len(inst['_base']):])
                continue
def breadthfirst(bt):
    r"""breadthfirst: binary tree -> list[Node]

    Purpose: Runs a breadth first search on a binary tree
    Consumes: a binary tree object
    Produces: a list of Nodes in breadth first search order

    Example:
                         A
        breadthfirst(   / \   ) -> [A B C]
                       B   C

    If the tree is empty, returns an empty list.
    If the tree is null, raises InvalidInputException.
    """
    if bt is None:
        raise InvalidInputException("Input is None")
    if bt.isEmpty():
        return []
    Q = Queue()
    qlist = []
    qlist.append(bt.root())
    Q.put(bt.root())
    while not Q.empty():
        node = Q.get()
        if bt.hasLeft(node):
            Q.put(bt.left(node))
            qlist.append(bt.left(node))
        if bt.hasRight(node):
            Q.put(bt.right(node))
            qlist.append(bt.right(node))
    return qlist
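# A runnable sketch exercising breadthfirst() above. The original tree class is
# not shown, so MiniNode/MiniTree below are hypothetical stand-ins that provide
# the root()/left()/right()/hasLeft()/hasRight()/isEmpty() interface the
# function assumes; Queue and InvalidInputException are assumed to be available
# from the snippet's own imports.
class MiniNode:
    def __init__(self, value, left=None, right=None):
        self.value = value
        self.left = left
        self.right = right

class MiniTree:
    def __init__(self, root=None):
        self._root = root

    def isEmpty(self):
        return self._root is None

    def root(self):
        return self._root

    def hasLeft(self, node):
        return node.left is not None

    def hasRight(self, node):
        return node.right is not None

    def left(self, node):
        return node.left

    def right(self, node):
        return node.right

def demo_breadthfirst():
    tree = MiniTree(MiniNode('A', MiniNode('B'), MiniNode('C')))
    print([node.value for node in breadthfirst(tree)])  # expected: ['A', 'B', 'C']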
class Sender():
    def __init__(self, name: str):
        self.name = name
        self.frames = None
        self.queue = Queue()

    async def send(self, frames: list, channel: 'Channel'):
        await asyncio.sleep(1)
        self.frames = frames
        await channel.connect(self)
        print(self.name, " :\tFrames received from network layer!")
        for i in range(0, len(frames)):
            copy = self.frames[i]
            # print(copy)
            if self.queue.empty():
                await self.queue.put(copy)
                print(self.name, " :\tFrame sent in the channel!")
            try:
                await asyncio.wait_for(channel.transmit(self), timeout=10)
                val = await asyncio.wait_for(self.queue.get(), timeout=10)
                print(self.name, " :\thas received", val, "from", val.source_address)
            except asyncio.TimeoutError:
                print(self.name, " :\tTimed Out!")
            await asyncio.sleep(3)
async def work(self, queue: asyncio.Queue,
               client_session: aiohttp.ClientSession) -> List[Any]:
    while not queue.empty():
        await self.rate_limiter.acquire()
        tasks = self.pick_tasks(queue, self.rate_limiter.rate_limit)
        futures = []
        for i, task in enumerate(tasks):
            future = self.backend.fetch_offer_async(client_session, task)
            futures.append(future)
            self.work_index[i] = task
            self.counter = self.counter + 1
        done = await asyncio.gather(*futures, return_exceptions=True)
        for idx, result in enumerate(done):
            if isinstance(result, Exception):
                failed_task = self.work_index[idx]
                logging.debug("{}: Reschedule task: {} -> {}".format(
                    self.backend.name(), failed_task.have, failed_task.want))
                logging.debug(result)
                queue.put_nowait(failed_task)
                self.counter = self.counter - 1
                self.just_failed = True
            else:
                self.results.extend(result)
        self.work_index.clear()
        await self.handle_error()
    return self.results
async def _loop_manager(self, *, wait_time: int, state_change_queue: Queue) -> None:
    start = 0
    running = True
    while running:
        if (time.time() - start) > wait_time:
            try:
                await self._load_publications()
            except:  # pylint: disable=bare-except
                self._logger.error(traceback.format_exc())
            self._logger.debug(f"Waiting {wait_time} seconds")
            start = time.time()
        else:
            await asyncio.sleep(self._WAIT_TIME)
            self._logger.debug(
                f"{int(wait_time - (time.time() - start))} seconds remaining until the next execution."
            )
        if state_change_queue.empty():
            self._logger.debug("No new state.")
        else:
            new_state: State = state_change_queue.get_nowait()
            if new_state == State.STOP:
                running = False
            else:
                raise NotImplementedError
    await self._close()
    self._logger.info("Shutdown")
def fetch_url(self, work_queue: asyncio.Queue):
    gecko_driver = self.option_register.get_register('gecko_driver')
    while not work_queue.empty():
        url: str = work_queue.get_nowait()
        url = url.replace("alert(1)", f"alert({self.random_int})")
        opts = Options()
        opts.headless = True
        driver = webdriver.Firefox(options=opts, executable_path=gecko_driver)
        try:
            driver.get(url)
            WebDriverWait(driver, 5).until(ec.alert_is_present())
            alert = driver.switch_to.alert
            if str(self.random_int) in alert.text:
                self.print_queue.put_nowait(('success', f"{url}"))
                # append so earlier findings are not overwritten
                with open("xss_report.txt", 'a') as f:
                    f.write(f"SUCCESS --> {url}\n")
            else:
                self.print_queue.put_nowait(('warning', f"{url}"))
                with open("xss_report.txt", 'a') as f:
                    f.write(f"POSSIBLY --> {url}\n")
            alert.accept()
        except TimeoutException:
            self.print_queue.put_nowait(('error', f"{url}"))
        except (Exception, KeyboardInterrupt) as e:
            print(f"ERROR:ERROR {e.__str__()}")
            raise KeyboardInterrupt
        finally:
            driver.quit()
def fetch_url(self, work_queue: asyncio.Queue, headers, listener=None, placeholder=None):
    while not work_queue.empty():
        url: str = work_queue.get_nowait()
        if placeholder and listener:
            url = url.replace(placeholder, listener)
        self.print_queue.put_nowait(('bold', f"Testing {url}"))
        try:
            with requests.Session() as session:
                retry = Retry(connect=3, backoff_factor=1, status_forcelist=[429, 504])
                adapter = HTTPAdapter(max_retries=retry, pool_connections=200, pool_maxsize=200)
                session.mount('http://', adapter=adapter)
                session.mount('https://', adapter=adapter)
                session.get(url, headers=headers, timeout=5)
        except requests.RequestException as e:
            self.print_queue.put_nowait(('error', f"{e.__str__()}\n"))
async def async_queue_reader(async_queue: asyncio.Queue, event: Event):
    """Checks the async queue for a message and, if one exists, prints it.

    Think of this as our sink.

    Args:
        async_queue: the queue to drain
        event: kill signal
    """
    while not event.is_set():
        try:
            # Note: get_nowait() is not awaited, unlike get(); it either returns
            # an item immediately or raises QueueEmpty.
            msg = async_queue.get_nowait()
            logging.debug(f"async_queue_reader received: {msg}")
            async_queue.task_done()
        except QueueEmpty:
            # The queue is empty; go back to sleep and check the event again.
            pass
        finally:
            # Always yield control so the shutdown event is checked continuously.
            await asyncio.sleep(0)
    logging.debug('async_queue_reader shutting down!')
    assert async_queue.empty()  # I am curious if this always passes.
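# A minimal driver sketch for async_queue_reader() above, assuming the snippet's
# names resolve to asyncio.Queue, asyncio.Event, and asyncio.QueueEmpty. The
# producer coroutine and the message strings are hypothetical.
import asyncio
import logging

async def demo_async_queue_reader():
    logging.basicConfig(level=logging.DEBUG)
    queue: asyncio.Queue = asyncio.Queue()
    stop = asyncio.Event()

    async def producer():
        for i in range(3):
            await queue.put(f"message {i}")
            await asyncio.sleep(0.1)
        await queue.join()   # wait until the reader has called task_done() for each item
        stop.set()           # then signal the reader to shut down

    await asyncio.gather(producer(), async_queue_reader(queue, stop))

# asyncio.run(demo_async_queue_reader())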
async def emit(q: asyncio.Queue) -> None:
    print('adding to queue')
    await q.put(True)
    while not q.empty():
        print('looping')
        await asyncio.sleep(1)
    print('exiting emit')
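# A companion sketch for emit() above: a hypothetical consumer drains the queue
# after a short delay, which lets emit()'s while-loop observe q.empty() and exit.
import asyncio

async def demo_emit():
    q: asyncio.Queue = asyncio.Queue()

    async def consume():
        await asyncio.sleep(2)
        q.get_nowait()  # remove the item so emit() stops looping

    await asyncio.gather(emit(q), consume())

# asyncio.run(demo_emit())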
class Clix:
    def __init__(self, creator: Callable):
        self._creator = creator
        self._queue = Queue()
        self._executor = ProcessPoolExecutor()
        self._loop = None

    def reform(self, mapper: Callable, predicate: Callable = notnull) -> 'Clix':
        self._queue.put_nowait(
            lambda iterable: filter_map(iterable, mapper, predicate))
        return self

    def map(self, mapper: Callable) -> 'Clix':
        self._queue.put_nowait(lambda iterable: gather(*(
            self.__execute(mapper, i) for i in iterable)))
        return self

    def __execute(self, function: Callable, *args: Any) -> Generator:
        return self._loop.run_in_executor(self._executor, function, *args)

    def flatten(self, flattener: Callable = (lambda v: v)) -> 'Clix':
        self._queue.put_nowait(
            lambda iterable: self.__flatten(iterable, flattener))
        return self

    @staticmethod
    async def __flatten(iterable: Iterable, flattener: Callable) -> Generator:
        return (i for si in iterable for i in flattener(si))

    def distinct(self, keymaker: Callable) -> 'Clix':
        self._queue.put_nowait(
            lambda iterable: self.__distinct(iterable, keymaker))
        return self

    @staticmethod
    async def __distinct(iterable: Iterable, keymaker: Callable) -> ValuesView:
        return {keymaker(i): i for i in iterable}.values()

    def sieve(self, mapper: Callable, predicate: Callable = notnull) -> 'Clix':
        return self.reform(lambda i: self.__execute(mapper, i), predicate)

    async def apply(self, applier: Callable) -> Iterable:
        self._loop = get_event_loop()
        iterable = await self._creator()
        while not self._queue.empty():
            function = await self._queue.get()
            iterable = await function(iterable)
        iterable = await gather(*(map(applier, iterable)))
        self._executor.shutdown()
        return iterable

    async def list(self) -> List[Any]:
        iterable = await self.apply(self.__skip)
        return list(iterable)

    @staticmethod
    async def __skip(value: Any) -> Any:
        return value
async def save_to_files(
    queue: Queue, condition: Condition, finish_event: Event, directory: Path
) -> None:
    directory.mkdir(parents=True, exist_ok=True)
    while not finish_event.is_set() or not queue.empty():
        with condition:
            while queue.empty():
                condition.wait()
            try:
                buffer, aptnote = queue.get_nowait()
            finally:
                queue.task_done()
        await write_file(buffer, directory, aptnote["filename"])
    relative_path = directory.relative_to(Path.cwd())
    no_of_files = len(list(directory.iterdir()))
    logger.info(f"Downloaded and saved {no_of_files} file(s) in {relative_path}")
async def download_packages_from_queue(queue: asyncio.Queue, context: Context):
    while not queue.empty():
        package: DownloadPackageBody = queue.get_nowait()
        await download_package(package_name=package.packageName,
                               version=package.version,
                               context=context)
        queue.task_done()
async def worker(name: str, queue: Queue):
    while not queue.empty():
        sleep_for = await queue.get()
        print(f'{name} is started')
        await asyncio.sleep(sleep_for)
        queue.task_done()
        print(f'{name} has worked for {sleep_for:.2f} seconds')
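# A minimal driver sketch for worker() above, assuming Queue is asyncio.Queue.
# The worker names and sleep durations are arbitrary illustration values.
import asyncio
import random

async def demo_workers():
    queue: asyncio.Queue = asyncio.Queue()
    for _ in range(10):
        queue.put_nowait(random.uniform(0.1, 0.5))
    # Several workers drain the same queue concurrently; each exits once
    # queue.empty() becomes true.
    await asyncio.gather(*(worker(f'worker-{i}', queue) for i in range(3)))

# asyncio.run(demo_workers())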
async def iter_queue(q: asyncio.Queue):
    while True:
        if q.empty():
            print("queue is empty... sleeping")
            yield await asyncio.sleep(0.1)
        else:
            print("queue has events... processing")
            yield await q.get()
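# A usage sketch for iter_queue() above: the generator never terminates on its
# own, so the consumer below breaks out explicitly. Note that the "empty"
# branch yields None (the result of asyncio.sleep), so consumers must handle
# those values. The sample event strings are arbitrary.
import asyncio

async def demo_iter_queue():
    q: asyncio.Queue = asyncio.Queue()
    for i in range(3):
        q.put_nowait(f"event-{i}")
    async for item in iter_queue(q):
        if item is None:
            break  # queue drained; stop iterating
        print("got", item)

# asyncio.run(demo_iter_queue())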
def save_to_json(queue: Queue, condition: Condition, finish_event: Event, path: Path):
    aptnotes = []
    while not finish_event.is_set() or not queue.empty():
        with condition:
            while queue.empty():
                condition.wait()
            try:
                augmented_aptnote = queue.get_nowait()
            finally:
                queue.task_done()
        aptnotes.append(augmented_aptnote)
    with open(path, "wt") as f:
        json.dump(aptnotes, f, sort_keys=True, indent=2)
    relative_path = path.relative_to(Path.cwd())
    logger.info(
        f"Downloaded, parsed, and saved {len(aptnotes)} document(s) in {relative_path}"
    )
async def listify_queue(queue: asyncio.Queue):
    item_list = []
    while not queue.empty():
        item = await queue.get()
        item_list.append(item)
    for item in item_list:
        await queue.put(item)
    return item_list
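# A quick usage sketch for listify_queue() above: it snapshots the queue's
# contents as a list and re-queues every item, leaving the queue unchanged.
# The sample values are arbitrary.
import asyncio

async def demo_listify_queue():
    queue: asyncio.Queue = asyncio.Queue()
    for value in (1, 2, 3):
        queue.put_nowait(value)
    snapshot = await listify_queue(queue)
    print(snapshot)        # [1, 2, 3]
    print(queue.qsize())   # 3 -- items were put back

# asyncio.run(demo_listify_queue())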
async def task(name: str, work_queue: asyncio.Queue):
    timer = Timer(text=f"Task {name} elapsed time: {{:.1f}}")
    while not work_queue.empty():
        delay = await work_queue.get()
        print(f"Task {name} is running")
        timer.start()
        await asyncio.sleep(delay)
        timer.stop()
def clean_queue(queue: asyncio.Queue):
    while not queue.empty():
        try:
            queue.get_nowait()
        except asyncio.QueueEmpty:
            break
    # cancel all coroutines waiting for pkt
    for getter in queue._getters:  # type: ignore
        getter.cancel()
class FileWriter:
    def __init__(self, torrent, loop):
        self.queue = Queue()
        self.torrent = torrent
        self.loop = loop
        self.path = torrent.get_info().file_name()
        self.is_done = False
        self.memory = {}
        files = glob.glob('pieces/*')
        for f in files:
            os.remove(f)

    def add_piece(self, piece):
        self.queue.put_nowait(piece)

    async def worker(self):
        while not self.is_done or not self.queue.empty():
            piece = await self.queue.get()
            await self._write_piece(piece)
            self.queue.task_done()
        await self._combine_pieces()
        self.loop.stop()

    async def _write_piece(self, piece):
        await self.loop.run_in_executor(
            _executor, self._writer, piece.piece, piece.get_data()
        )
        self.memory[piece.piece] = True

    async def _combine_pieces(self):
        with open(self.path, 'wb') as dest:
            keys = list(sorted(self.memory.keys()))
            for key in keys:
                with open(f'pieces/{key}', 'rb') as src:
                    dest.write(src.read())
        files = glob.glob('pieces/*')
        for f in files:
            os.remove(f)

    def _writer(self, piece, data):
        with open(f'pieces/{piece}', 'wb') as file:
            file.write(data)

    async def finish_writing(self):
        self.is_done = True
        print('\nWriting to file')
async def company_q_consumer(q: asyncio.Queue, result_lis: List):
    while not q.empty():
        linkedin_url = await q.get()
        try:
            company_data = await get_coy_profile(linkedin_url)
            result_lis += [[linkedin_url, company_data]]
        except Exception:
            print(f"Cannot enrich {linkedin_url}.")
def subscriber1(q: asyncio.Queue, *args):
    print('subscriber 1 received event:')
    time.sleep(2)
    print('sub 1: ' + args[0])
    print(q.empty())
    print(q.qsize())
    msg = q.get_nowait()
    print(msg)
class Buffer:
    def __init__(self, ack_callback, *, loop):
        self._ack_callback = ack_callback
        self._eof = False
        self._unacked = Queue(loop=loop)
        self._acked = deque()
        self._acked_size = 0

    def add(self, data, ack_size):
        self._unacked.put_nowait(UnackedData(data, len(data), ack_size))

    def eof(self):
        self._unacked.put_nowait(UnackedData(b'', 0, 0))
        self._eof = True

    async def read(self, size):
        assert size >= 0, 'Size can not be negative'
        if size == 0:
            return b''
        if not self._eof or not self._unacked.empty():
            while self._acked_size < size:
                data, data_size, ack_size = await self._unacked.get()
                if not ack_size:
                    break
                self._acked.append(AckedData(memoryview(data), data_size))
                self._acked_size += data_size
                self._ack_callback(ack_size)
        if self._eof and self._acked_size == 0:
            return b''
        if self._acked_size < size:
            raise AssertionError('Received less data than expected')
        chunks = []
        chunks_size = 0
        while chunks_size < size:
            next_chunk, next_chunk_size = self._acked[0]
            if chunks_size + next_chunk_size <= size:
                chunks.append(next_chunk)
                chunks_size += next_chunk_size
                self._acked.popleft()
            else:
                offset = size - chunks_size
                chunks.append(next_chunk[:offset])
                chunks_size += offset
                self._acked[0] = (next_chunk[offset:], next_chunk_size - offset)
        self._acked_size -= size
        assert chunks_size == size
        return b''.join(chunks)

    def unacked_size(self):
        return sum(self._unacked.get_nowait().ack_size
                   for _ in range(self._unacked.qsize()))
async def task(name, writer, queue: asyncio.Queue):
    while not queue.empty():
        url: str = await queue.get()
        url = url.strip()
        logging.info(f"Start check {url} on {name}")
        result = await checker(url)
        writer.writerow([
            result["url"], result["hostname"], result["port"],
            result["result"], result["desc"], result["validityExpires"]
        ])
async def task(name: str, work_queue: asyncio.Queue):
    timer = Timer(text=f"Task {name} elapsed time: {{:.1f}}")
    async with aiohttp.ClientSession() as session:
        while not work_queue.empty():
            url = await work_queue.get()
            print(f"Task {name} getting URL: {url}")
            timer.start()
            async with session.get(url) as response:
                await response.text()
            timer.stop()
class Cloner(object):
    def __init__(self, root, max_depth, css_validate):
        self.visited_urls = []
        self.root, self.error_page = self.add_scheme(root)
        self.max_depth = max_depth
        self.moved_root = None
        if len(self.root.host) < 4:
            sys.exit('invalid target {}'.format(self.root.host))
        self.target_path = '/opt/snare/pages/{}'.format(self.root.host)
        if not os.path.exists(self.target_path):
            os.mkdir(self.target_path)
        self.css_validate = css_validate
        self.new_urls = Queue()
        self.meta = {}
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def add_scheme(url):
        if url[-1] == '/':
            url = url.strip('/')
        if yarl.URL(url).scheme:
            new_url = yarl.URL(url)
            err_url = yarl.URL(url + '/status_404')
        else:
            new_url = yarl.URL('http://' + url)
            err_url = yarl.URL('http://' + url + '/status_404')
        return new_url, err_url

    async def process_link(self, url, level, check_host=False):
        try:
            url = yarl.URL(url)
        except UnicodeError:
            return None
        if url.scheme in ("data", "javascript", "file"):
            return url.human_repr()
        if not url.is_absolute():
            if self.moved_root is None:
                url = self.root.join(url)
            else:
                url = self.moved_root.join(url)
        host = url.host
        if check_host:
            if (host != self.root.host and self.moved_root is None) or \
                    url.fragment or \
                    (self.moved_root is not None and host != self.moved_root.host):
                return None
        if url.human_repr() not in self.visited_urls and (level + 1) <= self.max_depth:
            await self.new_urls.put((url, level + 1))
        res = None
        try:
            res = url.relative().human_repr()
        except ValueError:
            self.logger.error(url)
        return res

    async def replace_links(self, data, level):
        soup = BeautifulSoup(data, 'html.parser')
        # find all relative links
        for link in soup.findAll(href=True):
            res = await self.process_link(link['href'], level, check_host=True)
            if res is not None:
                link['href'] = res
        # find all images and scripts
        for elem in soup.findAll(src=True):
            res = await self.process_link(elem['src'], level)
            if res is not None:
                elem['src'] = res
        # find all action elements
        for act_link in soup.findAll(action=True):
            res = await self.process_link(act_link['action'], level)
            if res is not None:
                act_link['action'] = res
        # prevent redirects
        for redir in soup.findAll(True, attrs={'name': re.compile('redirect.*')}):
            if redir['value'] != "":
                redir['value'] = yarl.URL(redir['value']).relative().human_repr()
        return soup

    def _make_filename(self, url):
        host = url.host
        if url.is_absolute():
            file_name = url.relative().human_repr()
        else:
            file_name = url.human_repr()
        if not file_name.startswith('/'):
            file_name = "/" + file_name
        if file_name == '/' or file_name == "":
            if host == self.root.host or (self.moved_root is not None and
                                          self.moved_root.host == host):
                file_name = '/index.html'
            else:
                file_name = host
        m = hashlib.md5()
        m.update(file_name.encode('utf-8'))
        hash_name = m.hexdigest()
        return file_name, hash_name

    async def get_body(self, session):
        while not self.new_urls.empty():
            current_url, level = await self.new_urls.get()
            if current_url.human_repr() in self.visited_urls:
                continue
            self.visited_urls.append(current_url.human_repr())
            file_name, hash_name = self._make_filename(current_url)
            print('name: ', file_name)
            self.meta[file_name] = {}
            data = None
            content_type = None
            try:
                response = await session.get(current_url, headers={'Accept': 'text/html'},
                                             timeout=10.0)
                content_type = response.content_type
                data = await response.read()
            except (aiohttp.ClientError, asyncio.TimeoutError) as client_error:
                self.logger.error(client_error)
            else:
                await response.release()
            if data is not None:
                self.meta[file_name]['hash'] = hash_name
                self.meta[file_name]['content_type'] = content_type
                if content_type == 'text/html':
                    soup = await self.replace_links(data, level)
                    data = str(soup).encode()
                with open(os.path.join(self.target_path, hash_name), 'wb') as index_fh:
                    index_fh.write(data)
                if content_type == 'text/css':
                    css = cssutils.parseString(data, validate=self.css_validate)
                    for carved_url in cssutils.getUrls(css):
                        if carved_url.startswith('data'):
                            continue
                        carved_url = yarl.URL(carved_url)
                        if not carved_url.is_absolute():
                            carved_url = self.root.join(carved_url)
                        if carved_url.human_repr() not in self.visited_urls:
                            await self.new_urls.put((carved_url, level + 1))

    async def get_root_host(self):
        try:
            async with aiohttp.ClientSession() as session:
                resp = await session.get(self.root)
                if resp.host != self.root.host:
                    self.moved_root = resp.url
                resp.close()
        except aiohttp.ClientError as err:
            self.logger.error("Can't connect to target host: %s", err)
            exit(-1)

    async def run(self):
        session = aiohttp.ClientSession()
        try:
            await self.new_urls.put((self.root, 0))
            await self.new_urls.put((self.error_page, 0))
            await self.get_body(session)
        except KeyboardInterrupt:
            raise
        finally:
            with open(os.path.join(self.target_path, 'meta.json'), 'w') as mj:
                json.dump(self.meta, mj)
            await session.close()
class Cloner(object):
    def __init__(self, root):
        self.visited_urls = []
        self.root = self.add_scheme(root)
        if len(self.root.host) < 4:
            sys.exit('invalid target {}'.format(self.root.host))
        self.target_path = '/opt/snare/pages/{}'.format(self.root.host)
        if not os.path.exists(self.target_path):
            os.mkdir(self.target_path)
        self.new_urls = Queue()

    @staticmethod
    def add_scheme(url):
        if yarl.URL(url).scheme:
            new_url = yarl.URL(url)
        else:
            new_url = yarl.URL('http://' + url)
        return new_url

    @asyncio.coroutine
    def process_link(self, url, check_host=False):
        url = yarl.URL(url)
        if check_host:
            if (url.host != self.root.host or url.fragment
                    or url in self.visited_urls):
                return None
        if not url.is_absolute():
            url = self.root.join(url)
        yield from self.new_urls.put(url)
        return url.relative().human_repr()

    @asyncio.coroutine
    def replace_links(self, data):
        soup = BeautifulSoup(data, 'html.parser')
        # find all relative links
        for link in soup.findAll(href=True):
            res = yield from self.process_link(link['href'], check_host=True)
            if res is not None:
                link['href'] = res
        # find all images and scripts
        for elem in soup.findAll(src=True):
            res = yield from self.process_link(elem['src'])
            if res is not None:
                elem['src'] = res
        # find all action elements
        for act_link in soup.findAll(action=True):
            res = yield from self.process_link(act_link['action'])
            if res is not None:
                act_link['action'] = res
        # prevent redirects
        for redir in soup.findAll(True, attrs={'name': re.compile('redirect.*')}):
            redir['value'] = yarl.URL(redir['value']).relative().human_repr()
        return soup

    @asyncio.coroutine
    def get_body(self):
        while not self.new_urls.empty():
            current_url = yield from self.new_urls.get()
            if current_url in self.visited_urls:
                continue
            self.visited_urls.append(current_url)
            if current_url.name:
                file_name = current_url.name
            elif current_url.raw_path != '/':
                file_name = current_url.path.rsplit('/')[1]
            else:
                file_name = 'index.html'
            file_path = os.path.dirname(current_url.path)
            if file_path == '/':
                file_path = self.target_path
            else:
                file_path = os.path.join(self.target_path, file_path[1:])
            print('path: ', file_path, 'name: ', file_name)
            if file_path and not os.path.exists(file_path):
                os.makedirs(file_path)
            data = None
            try:
                with aiohttp.Timeout(10.0):
                    with aiohttp.ClientSession() as session:
                        response = yield from session.get(current_url)
                        data = yield from response.read()
            except aiohttp.ClientError as client_error:
                print(client_error)
            else:
                response.release()
                session.close()
            if data is not None:
                if re.match(re.compile(r'.*\.(html|php)'), file_name):
                    soup = yield from self.replace_links(data)
                    data = str(soup).encode()
                with open(os.path.join(file_path, file_name), 'wb') as index_fh:
                    index_fh.write(data)
                if '.css' in file_name:
                    css = cssutils.parseString(data)
                    for carved_url in cssutils.getUrls(css):
                        if carved_url.startswith('data'):
                            continue
                        carved_url = yarl.URL(carved_url)
                        if not carved_url.is_absolute():
                            carved_url = self.root.join(carved_url)
                        if carved_url not in self.visited_urls:
                            yield from self.new_urls.put(carved_url)

    @asyncio.coroutine
    def run(self):
        yield from self.new_urls.put(self.root)
        return (yield from self.get_body())