Example #1
def save_to_csv(queue: Queue, condition: Condition, finish_event: Event, path: Path):
    fieldnames = (
        "unique_id",
        "filename",
        "title",
        "source",
        "splash_url",
        "sha1",
        "date",
        "file_url",
        "fulltext",
        "creation_date",
        "creator_tool",
        "creator_title",
    )
    buffer = io.StringIO()
    writer = csv.DictWriter(buffer, fieldnames=fieldnames)
    writer.writeheader()
    inserted_values = 0
    while not finish_event.is_set() or not queue.empty():
        with condition:
            while queue.empty():
                condition.wait()
            try:
                augmented_aptnote = queue.get_nowait()
            finally:
                queue.task_done()
        writer.writerow(augmented_aptnote)
        inserted_values += 1
    with open(path, "wt") as f:
        print(buffer.getvalue(), file=f)
    relative_path = path.relative_to(Path.cwd())
    logger.info(
        f"Downloaded, parsed, and saved {inserted_values} document(s) in {relative_path}"
    )
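
A minimal wiring sketch for a consumer like this, assuming the standard-library queue.Queue, threading.Condition, and threading.Event, and assuming save_to_csv is importable from a module that already defines its imports and logger; the sample row and output path are invented for illustration. Filling the queue and setting the finish event before the consumer runs keeps the sketch free of the notify/wait races a real producer thread would have to handle.

# Illustrative wiring only; save_to_csv and its module-level logger are assumed
# to exist, and the row below is a stand-in for a parsed document.
from pathlib import Path
from queue import Queue
from threading import Condition, Event

notes_queue: Queue = Queue()
condition = Condition()
finish_event = Event()

notes_queue.put({"unique_id": 1, "filename": "report.pdf", "title": "Example"})
finish_event.set()  # nothing else will be produced

save_to_csv(notes_queue, condition, finish_event, Path.cwd() / "aptnotes.csv")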
Example #2
async def save_to_sqlite(
    queue: Queue, condition: Condition, finish_event: Event, path: Path
) -> None:
    async with aiosqlite.connect(path) as db:
        await db_init(db)

        inserted_values = 0
        while not finish_event.is_set() or not queue.empty():
            with condition:
                while queue.empty():
                    condition.wait()
                try:
                    augmented_aptnote = queue.get_nowait()
                except Exception as e:
                    logger.error(e)
                    continue
                finally:
                    queue.task_done()
            await insert_values(db, augmented_aptnote)
            await db.commit()
            inserted_values += 1

    relative_path = path.relative_to(Path.cwd())
    logger.info(
        f"Downloaded, parsed, and saved {inserted_values} document(s) in {relative_path}"
    )
Example #3
    def valuation(self):
        my_value = 0
        enemy_value = 0
        queues = list()
        positions = list(filter(lambda player: player != DEAD, self.players))

        saved_nodes = [[content for content in row] for row in self.nodes]

        for cell in positions:
            queue = Queue()
            queue.put_nowait(cell)
            queues.append(queue)

        while not all(map(lambda queue: queue.empty(), queues)):
            for player, queue in enumerate(queues):
                if queue.empty():
                    continue
                if self.players[player][0] == -1:
                    item_row, item_col = queue.get_nowait()
                    self.nodes[item_row][item_col] = FREE
                    continue
                symbol = PLAYERS_SYMBOLS[player]
                item_row, item_col = queue.get_nowait()
                for _, neighbor_row, neighbor_col in self.free_neighbors(item_row, item_col):
                    self.nodes[neighbor_row][neighbor_col] = symbol

                    queue.put_nowait((neighbor_row, neighbor_col))
                    if player == self.me:
                        my_value += 1
                    else:
                        enemy_value += 1

        self.nodes = saved_nodes
        return (my_value - enemy_value) / (my_value + enemy_value + 1)
Example #4
    async def run(self, connection: OptolinkConnection,
                  command_queue: asyncio.Queue):
        connection.flush()
        while True:
            # poll start bytes (0x05) and discard them
            byte = await connection.read()
            if byte[0] != 0x05:
                # we are not in synchronization phase and received a byte other than
                # the synchronization byte -> just wait for the next byte
                continue
            # when there is at least one command waiting in the queue, start the communication
            if not command_queue.empty():
                connection.write(b"\x01")
                # TODO: start measuring utilization here
                try:
                    while True:
                        cmd, fut = await asyncio.wait_for(
                            command_queue.get(), timeout=0.5)
                        connection.write(cmd.get_command_bytes())
                        val = await connection.read(
                            cmd.get_expected_bytes_count())
                        if all(it == 0x05 for it in val):
                            fut.set_exception(Exception("Command failed"))
                            # we must synchronize again
                            break
                        else:
                            if fut.done():
                                print(fut.result())
                                raise Exception("Future was already done")
                            fut.set_result(cmd.handle_result(val))
                except asyncio.TimeoutError:
                    continue
                finally:
                    pass
Example #5
async def ffmpeg(task_queue: asyncio.Queue, task_id: asyncio.Queue):
    """
    function to process video using queue.
    """
    assert isinstance(FFMPEG, str)
    while not task_queue.empty():
        try:
            task = await task_queue.get()
            process_id = await task_id.get()
            cmd = process_input(task['input'], task['output'], task['rate'],
                                task['fps'], task['res'])
            proc = subprocess.Popen(cmd,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            print('=' * 20 +
                  ' process {}: Converting file {} to output file {} '.format(
                      process_id, task['input'], task['output']) + '=' * 20)
            proc.communicate()
            ret = proc.returncode
            if ret != 0:
                print(
                    '=' * 20 +
                    ' process {}: Failed to convert file {} | return code {} '
                    .format(process_id, task['input'], ret) + '=' * 20)
            else:
                print('=' * 20 +
                      ' process {}: Completed converting file {} '.format(
                          process_id, task['input']) + '=' * 20)
                print(' Done ')
            task_queue.task_done()
            task_id.task_done()
        except queue.Empty:
            print("no task")
            break
Example #6
class EventManager(object):
    event_queue = None
    _subscribers = None

    def __init__(self):
        self.event_queue = Queue()
        self._subscribers = defaultdict(set)

    def subscribe(self, event_type, subscriber):
        logger.debug('subscribe %s for %s', subscriber, event_type)
        self._subscribers[event_type].add(subscriber)

    def unsubscribe(self, event_type, subscriber):
        #TODO: wtf? where is the code?!
        try:
            self._subscribers[event_type].remove(subscriber)
        except KeyError:
            pass

    def add_event(self, event):
        logger.debug('put event %s into queue', event)
        self.event_queue.put_nowait(event)  # TODO: WTF? NOWAIT?

    async def process_events(self):
        while True:
            # logger.debug('Processing %s events', self.event_queue.qsize())
            while not self.event_queue.empty():
                event = await self.event_queue.get()
                logger.debug('processing event %s', event)
                subscribers_list = self._subscribers.get(event['channel'], [])
                for subscriber in subscribers_list:
                    await subscriber.process_event(event)
                if not subscribers_list:
                    logger.debug('no listeners for event %s', event)
            await asyncio.sleep(0.1)
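
A short usage sketch, under the assumptions that events are dicts carrying a 'channel' key (as the lookup in process_events implies), that subscribers expose an async process_event coroutine, and that the module's logger is defined; PrintSubscriber and the channel name are invented here.

# Hypothetical subscriber and driver; EventManager is the class above.
import asyncio

class PrintSubscriber:
    async def process_event(self, event):
        print("received:", event)

async def main():
    manager = EventManager()
    manager.subscribe("news", PrintSubscriber())
    manager.add_event({"channel": "news", "payload": "hello"})

    # process_events() loops forever, so run it briefly and cancel it.
    task = asyncio.create_task(manager.process_events())
    await asyncio.sleep(0.2)
    task.cancel()
    try:
        await task
    except asyncio.CancelledError:
        pass

asyncio.run(main())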
Example #7
async def parse(item_links: asyncio.Queue, sess: ClientSession,
                items: asyncio.Queue, prox: str):
    start = True
    while not item_links.empty() or start:
        start = False
        item = inst['_base'] + await item_links.get()
        for _ in range(inst['_retry']):
            await asyncio.sleep(1.5)
            try:
                async with sess.get(item,
                                    headers=inst['_headers'],
                                    proxy=prox,
                                    proxy_auth=auth) as resp:
                    txt = await resp.text()
                    page = fs(txt)
                    res = {'url': item}
                    for k, v in inst['fields'].items():
                        val = page.xpath(v['path'])
                        res[k] = v['type'](val) if val else None
                    table = page.xpath(inst['table']['home'])
                    for t in table:
                        for k, v in zip(t.xpath(inst['table']['title']),
                                        t.xpath(inst['table']['value'])):
                            res[k] = v

                    if res['name'] is None:
                        await item_links.put(item[len(inst['_base']):])
                    else:
                        print(datetime.now())
                        await items.put(res)
                        break
            except:
                await item_links.put(item[len(inst['_base']):])
                continue
Example #8
def breadthfirst(bt):
    """breadthfirst: binary tree -> list[Node]
    Purpose: Runs a breadth first search on a binary tree
    Consumes: a binary tree object
    Produces: a list of Nodes in breadth first search order
    Example: 
                    A 
    breadthfirst(  / \  ) -> [A B C]
                  B   C 
    If tree is empty, should return an empty list. If the tree
    is null, you should throw InvalidInputException. 
    """
    if bt is None:
        raise InvalidInputException("Input is None")
    if bt.isEmpty():
        return []

    Q = Queue()
    qlist = []
    qlist.append(bt.root())
    Q.put(bt.root())

    while not Q.empty():

        node = Q.get()

        if bt.hasLeft(node):
            Q.put(bt.left(node))
            qlist.append(bt.left(node))
        if bt.hasRight(node):
            Q.put(bt.right(node))
            qlist.append(bt.right(node))

    return qlist
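
The tree API used above (root(), hasLeft(), isEmpty(), InvalidInputException) comes from a course framework that is not shown, so here is the same level-order idea in a self-contained form with a throwaway Node class and collections.deque, purely as an illustration:

# Minimal level-order traversal sketch; Node is a stand-in, not the
# tree class the example above expects.
from collections import deque
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Node:
    value: str
    left: Optional["Node"] = None
    right: Optional["Node"] = None

def level_order(root: Optional[Node]) -> List[str]:
    if root is None:
        return []
    order, pending = [], deque([root])
    while pending:
        node = pending.popleft()
        order.append(node.value)
        if node.left:
            pending.append(node.left)
        if node.right:
            pending.append(node.right)
    return order

print(level_order(Node("A", Node("B"), Node("C"))))   # ['A', 'B', 'C']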
Example #9
class Sender():
    def __init__(self, name: str):
        self.name = name
        self.frames = None
        self.queue = Queue()

    async def send(self, frames: list, channel: 'Channel'):
        await asyncio.sleep(1)
        self.frames = frames
        await channel.connect(self)
        print(self.name, "   :\tFrames received form network layer!")
        for i in range(0, len(frames)):
            copy = self.frames[i]
            # print(copy)
            if self.queue.empty():
                await self.queue.put(copy)
            print(self.name, "   :\tFrame sent in the channel!")
            try:
                await asyncio.wait_for(channel.transmit(self), timeout=10)
                val = await asyncio.wait_for(self.queue.get(), timeout=10)
                print(self.name, "   :\thas received", val, "from",
                      val.source_address)
            except asyncio.TimeoutError as e:
                print(self.name, "   :\tTimed Out!")
            await asyncio.sleep(3)
Example #10
    async def work(self, queue: asyncio.Queue, client_session: aiohttp.ClientSession) -> List[Any]:
        while not queue.empty():
            await self.rate_limiter.acquire()

            tasks = self.pick_tasks(queue, self.rate_limiter.rate_limit)

            futures = []
            for i, task in enumerate(tasks):
                future = self.backend.fetch_offer_async(client_session, task)
                futures.append(future)
                self.work_index[i] = task
                self.counter = self.counter + 1

            done = await asyncio.gather(*futures, return_exceptions=True)
            for idx, result in enumerate(done):
                if isinstance(result, Exception):
                    failed_task = self.work_index[idx]
                    logging.debug("{}: Reschedule task: {} -> {}".format(self.backend.name(),
                                                                         failed_task.have, failed_task.want))
                    logging.debug(result)
                    queue.put_nowait(failed_task)
                    self.counter = self.counter - 1
                    self.just_failed = True
                else:
                    self.results.extend(result)

            self.work_index.clear()

            await self.handle_error()

        return self.results
Example #11
    async def _loop_manager(self, *, wait_time: int,
                            state_change_queue: Queue) -> None:
        start = 0
        running = True
        while running:
            if (time.time() - start) > wait_time:
                try:
                    await self._load_publications()
                except:  # pylint: disable=bare-except
                    self._logger.error(traceback.format_exc())
                self._logger.debug(f"Waiting {wait_time} seconds", )
                start = time.time()
            else:
                await asyncio.sleep(self._WAIT_TIME)
                self._logger.debug(
                    f"Remains {int(wait_time - (time.time() - start))} seconds, to execute the task."
                )

            if state_change_queue.empty():
                self._logger.debug("No new state.")
            else:
                new_state: State = state_change_queue.get_nowait()
                if new_state == State.STOP:
                    running = False
                else:
                    raise NotImplementedError
        await self._close()
        self._logger.info("Shutdown")
Example #12
    def fetch_url(self, work_queue: asyncio.Queue):
        gecko_driver = self.option_register.get_register('gecko_driver')
        while not work_queue.empty():
            url: str = work_queue.get_nowait()
            url = url.replace("alert(1)", f"alert({self.random_int})")
            opts = Options()
            opts.headless = True
            driver = webdriver.Firefox(
                options=opts, executable_path=gecko_driver)
            try:
                driver.get(url)
                WebDriverWait(driver, 5).until(ec.alert_is_present())
                alert = driver.switch_to.alert
                if str(self.random_int) in alert.text:
                    self.print_queue.put_nowait(('success', f"{url}"))
                    with open("xss_report.txt", 'w') as f:
                        f.write(f"SUCCESS --> {url}")
                else:
                    self.print_queue.put_nowait(('warning', f"{url}"))
                    with open("xss_report", 'w') as f:
                        f.write(f"POSSIBLY --> {url}")
                alert.accept()
            except TimeoutException:
                self.print_queue.put_nowait(('error', f"{url}"))
            except (Exception, KeyboardInterrupt) as e:
                print(f"ERROR: {e}")
                raise KeyboardInterrupt
            finally:
                driver.quit()
Example #13
    def fetch_url(self,
                  work_queue: asyncio.Queue,
                  headers,
                  listener=None,
                  placeholder=None):
        while not work_queue.empty():
            url: str = work_queue.get_nowait()

            if placeholder and listener:
                url = url.replace(placeholder, listener)

            self.print_queue.put_nowait(('bold', f"Testing {url}"))
            try:
                with requests.Session() as session:
                    retry = Retry(connect=3,
                                  backoff_factor=1,
                                  status_forcelist=[429, 504])
                    adapter = HTTPAdapter(max_retries=retry,
                                          pool_connections=200,
                                          pool_maxsize=200)
                    session.mount('http://', adapter=adapter)
                    session.mount('https://', adapter=adapter)
                    session.get(url, headers=headers, timeout=5)
            except requests.RequestException as e:
                self.print_queue.put_nowait(('error', f"{e.__str__()}\n"))
Example #14
async def async_queue_reader(async_queue: asyncio.Queue, event: Event):
    """Checks the async queue for a message and if it exists prints it

    Think of this as our sink.

    Args:
        async_queue:
        event: Kill signal
    """
    while True and not event.is_set():
        try:
            """
            This part was hard to get right.
            First, you don't await get_nowait() unlike get(),
            I guess this is because no wait assumes the value is immediately available or throws.
            """
            msg = async_queue.get_nowait()
            logging.debug(f"async_queue_reader received: {msg}")
            async_queue.task_done()
        except QueueEmpty as err:
            """
            If our Queue is empty go back to sleep and check the event again.
            """
            pass
        finally:
            """
            We always want to sleep so the shut down event is continuously checked.
            """
            await asyncio.sleep(0)
    logging.debug('async_queue_reader shutting down!')
    assert async_queue.empty()  # I am curious if this always passes.
Example #15
async def emit(q: asyncio.Queue) -> None:
    print('adding to queue')
    await q.put(True)
    while not q.empty():
        print('looping')
        await asyncio.sleep(1)
    print('exiting emit')
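
emit only returns once another task drains the queue, so a sketch of pairing it with a consumer looks like this (drain and the timings are made up):

# Illustrative pairing: emit() keeps looping until drain() empties the queue.
import asyncio

async def drain(q: asyncio.Queue) -> None:
    await asyncio.sleep(2)   # pretend to be busy elsewhere first
    await q.get()            # removing the item lets emit() exit
    q.task_done()

async def main() -> None:
    q: asyncio.Queue = asyncio.Queue()
    await asyncio.gather(emit(q), drain(q))

asyncio.run(main())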
Example #16
class Clix:
    def __init__(self, creator: Callable):
        self._creator = creator
        self._queue = Queue()
        self._executor = ProcessPoolExecutor()
        self._loop = None

    def reform(self,
               mapper: Callable,
               predicate: Callable = notnull) -> 'Clix':
        self._queue.put_nowait(
            lambda iterable: filter_map(iterable, mapper, predicate))
        return self

    def map(self, mapper: Callable) -> 'Clix':
        self._queue.put_nowait(lambda iterable: gather(*(
            self.__execute(mapper, i) for i in iterable)))
        return self

    def __execute(self, function: Callable, *args: Any) -> Generator:
        return self._loop.run_in_executor(self._executor, function, *args)

    def flatten(self, flattener: Callable = (lambda v: v)) -> 'Clix':
        self._queue.put_nowait(
            lambda iterable: self.__flatten(iterable, flattener))
        return self

    @staticmethod
    async def __flatten(iterable: Iterable, flattener: Callable) -> Generator:
        return (i for si in iterable for i in flattener(si))

    def distinct(self, keymaker: Callable) -> 'Clix':
        self._queue.put_nowait(
            lambda iterable: self.__distinct(iterable, keymaker))
        return self

    @staticmethod
    async def __distinct(iterable: Iterable, keymaker: Callable) -> ValuesView:
        return {keymaker(i): i for i in iterable}.values()

    def sieve(self, mapper: Callable, predicate: Callable = notnull) -> 'Clix':
        return self.reform(lambda i: self.__execute(mapper, i), predicate)

    async def apply(self, applier: Callable) -> Iterable:
        self._loop = get_event_loop()
        iterable = await self._creator()
        while not self._queue.empty():
            function = await self._queue.get()
            iterable = await function(iterable)
        iterable = await gather(*(map(applier, iterable)))
        self._executor.shutdown()
        return iterable

    async def list(self) -> List[Any]:
        iterable = await self.apply(self.__skip)
        return list(iterable)

    @staticmethod
    async def __skip(value: Any) -> Any:
        return value
Example #17
async def save_to_files(
    queue: Queue, condition: Condition, finish_event: Event, directory: Path
) -> None:
    directory.mkdir(parents=True, exist_ok=True)
    while not finish_event.is_set() or not queue.empty():
        with condition:
            while queue.empty():
                condition.wait()
            try:
                buffer, aptnote = queue.get_nowait()
            finally:
                queue.task_done()
        await write_file(buffer, directory, aptnote["filename"])
    relative_path = directory.relative_to(Path.cwd())
    no_of_files = len(list(directory.iterdir()))
    logger.info(f"Downloaded and saved {no_of_files} file(s) in {relative_path}")
Example #18
async def download_packages_from_queue(queue: asyncio.Queue, context: Context):

    while not queue.empty():
        package: DownloadPackageBody = queue.get_nowait()
        await download_package(package_name=package.packageName,
                               version=package.version,
                               context=context)
        queue.task_done()
Example #19
async def worker(name: str, queue: Queue):

    while not queue.empty():
        sleep_for = await queue.get()
        print(f'{name} is started')
        await asyncio.sleep(sleep_for)
        queue.task_done()
        print(f'{name} has worked for {sleep_for:.2f} seconds')
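
A sketch of how such workers are typically driven: because each worker checks queue.empty() before awaiting get(), the queue has to be filled before the workers start (the delays and worker count are arbitrary).

# Illustrative driver for the worker coroutine above.
import asyncio
import random

async def main() -> None:
    queue: asyncio.Queue = asyncio.Queue()
    for _ in range(10):
        queue.put_nowait(random.uniform(0.1, 0.5))
    await asyncio.gather(*(worker(f"worker-{i}", queue) for i in range(3)))

asyncio.run(main())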
Example #20
async def iter_queue(q: asyncio.Queue):
    while True:
        if q.empty():
            print("queue is empty... sleeping")
            yield await asyncio.sleep(0.1)
        else:
            print("queue has events... processing")
            yield await q.get()
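
Because the generator yields the result of the sleep (None) whenever the queue is empty, a consumer has to treat None as "no event yet"; a made-up driver:

# Hypothetical consumer; stops once the queue runs dry.
import asyncio

async def main() -> None:
    q: asyncio.Queue = asyncio.Queue()
    q.put_nowait("first")
    q.put_nowait("second")
    async for item in iter_queue(q):
        if item is None:      # the queue was empty for this iteration
            break
        print("event:", item)

asyncio.run(main())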
Example #21
def save_to_json(queue: Queue, condition: Condition, finish_event: Event, path: Path):
    aptnotes = []
    while not finish_event.is_set() or not queue.empty():
        with condition:
            while queue.empty():
                condition.wait()
            try:
                augmented_aptnote = queue.get_nowait()
            finally:
                queue.task_done()
        aptnotes.append(augmented_aptnote)
    with open(path, "wt") as f:
        json.dump(aptnotes, f, sort_keys=True, indent=2)
    relative_path = path.relative_to(Path.cwd())
    logger.info(
        f"Downloaded, parsed, and saved {len(aptnotes)} document(s) in {relative_path}"
    )
Example #22
async def listify_queue(queue: asyncio.Queue):
    item_list = []
    while not queue.empty():
        item = await queue.get()
        item_list.append(item)
    for item in item_list:
        await queue.put(item)
    return item_list
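
A quick illustrative check that the helper returns the items while leaving the queue as it found it:

# Illustrative only: the queue still holds its items after listify_queue().
import asyncio

async def main() -> None:
    q: asyncio.Queue = asyncio.Queue()
    for n in (1, 2, 3):
        q.put_nowait(n)
    snapshot = await listify_queue(q)
    print(snapshot)     # [1, 2, 3]
    print(q.qsize())    # 3 -- the items were put back in order

asyncio.run(main())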
Example #23
async def task(name: str, work_queue: asyncio.Queue):
    timer = Timer(text=f"Task {name} elapsed time: {{:.1f}}")
    while not work_queue.empty():
        delay = await work_queue.get()
        print(f"Task {name} is running")
        timer.start()
        await asyncio.sleep(delay)
        timer.stop()
Example #24
def clean_queue(queue: asyncio.Queue):
    while not queue.empty():
        try:
            queue.get_nowait()
        except asyncio.QueueEmpty:
            break
    # cancel all coroutines waiting for pkt
    for getter in queue._getters:  # type: ignore
        getter.cancel()
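
A small sketch of the effect on a reader that is blocked on an empty queue; the cancellation comes from the getter.cancel() loop above, and the scenario (a single stale reader) is invented:

# Illustrative: clean_queue() drains buffered packets and cancels blocked readers.
import asyncio

async def main() -> None:
    q: asyncio.Queue = asyncio.Queue()
    reader = asyncio.create_task(q.get())   # blocks, since the queue is empty
    await asyncio.sleep(0)                  # let the reader start waiting

    clean_queue(q)

    await asyncio.sleep(0.1)                # let the cancellation propagate
    print(reader.cancelled())               # True

asyncio.run(main())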
Example #25
class FileWriter:

    def __init__(self, torrent, loop):
        self.queue = Queue()
        self.torrent = torrent
        self.loop = loop
        self.path = torrent.get_info().file_name()
        self.is_done = False
        self.memory = {}

        files = glob.glob('pieces/*')
        for f in files:
            os.remove(f)

    def add_piece(self, piece):
        self.queue.put_nowait(piece)

    async def worker(self):
        while not self.is_done or not self.queue.empty():
            piece = await self.queue.get()

            await self._write_piece(piece)
            self.queue.task_done()

        await self._combine_pieces()
        self.loop.stop()

    async def _write_piece(self, piece):

        await self.loop.run_in_executor(
            _executor, self._writer,
            piece.piece, piece.get_data()
        )

        self.memory[piece.piece] = True

    async def _combine_pieces(self):

        with open(self.path, 'wb') as dest:
            keys = list(sorted(self.memory.keys()))
            for key in keys:
                src = open(f'pieces/{key}', 'rb').read()
                dest.write(src)

        files = glob.glob('pieces/*')
        for f in files:
            os.remove(f)

    def _writer(self, piece, data):

        file = open(f'pieces/{piece}', 'wb')
        file.write(data)
        file.close()

    async def finish_writing(self):
        self.is_done = True
        print('\nWriting to file')
Example #26
async def company_q_consumer(q: asyncio.Queue, result_lis: List):
    while not q.empty():
        linkedin_url = await q.get()
        try:
            company_data = await get_coy_profile(linkedin_url)
            result_lis += [[linkedin_url, company_data]]
        except Exception:
            print(f"Cannot enrich {linkedin_url}.")
            pass
Example #27
def subscriber1(q: asyncio.Queue, *args):
    print('subscriber 1 received event:')
    time.sleep(2)
    print('sub 1: ' + args[0])

    print(q.empty())
    print(q.qsize())
    msg = q.get_nowait()
    print(msg)
Example #28
class Buffer:
    def __init__(self, ack_callback, *, loop):
        self._ack_callback = ack_callback
        self._eof = False
        self._unacked = Queue(loop=loop)
        self._acked = deque()
        self._acked_size = 0

    def add(self, data, ack_size):
        self._unacked.put_nowait(UnackedData(data, len(data), ack_size))

    def eof(self):
        self._unacked.put_nowait(UnackedData(b'', 0, 0))
        self._eof = True

    async def read(self, size):
        assert size >= 0, 'Size can not be negative'
        if size == 0:
            return b''

        if not self._eof or not self._unacked.empty():
            while self._acked_size < size:
                data, data_size, ack_size = await self._unacked.get()
                if not ack_size:
                    break
                self._acked.append(AckedData(memoryview(data), data_size))
                self._acked_size += data_size
                self._ack_callback(ack_size)

        if self._eof and self._acked_size == 0:
            return b''

        if self._acked_size < size:
            raise AssertionError('Received less data than expected')

        chunks = []
        chunks_size = 0
        while chunks_size < size:
            next_chunk, next_chunk_size = self._acked[0]
            if chunks_size + next_chunk_size <= size:
                chunks.append(next_chunk)
                chunks_size += next_chunk_size
                self._acked.popleft()
            else:
                offset = size - chunks_size
                chunks.append(next_chunk[:offset])
                chunks_size += offset
                self._acked[0] = (next_chunk[offset:],
                                  next_chunk_size - offset)
        self._acked_size -= size
        assert chunks_size == size
        return b''.join(chunks)

    def unacked_size(self):
        return sum(self._unacked.get_nowait().ack_size
                   for _ in range(self._unacked.qsize()))
Example #29
async def task(name, writer, queue: asyncio.Queue):
    while not queue.empty():
        url: str = await queue.get()
        url = url.strip()
        logging.info(f"Start check {url} on {name}")
        result = await checker(url)
        writer.writerow([
            result["url"], result["hostname"], result["port"],
            result["result"], result["desc"], result["validityExpires"]
        ])
Example #30
async def task(name: str, work_queue: asyncio.Queue):
    timer = Timer(text=f"Task {name} elapsed time: {{:.1f}}")
    async with aiohttp.ClientSession() as session:
        while not work_queue.empty():
            url = await work_queue.get()
            print(f"Task {name} getting URL: {url}")
            timer.start()
            async with session.get(url) as response:
                await response.text()
            timer.stop()
Example #31
class Cloner(object):
    def __init__(self, root, max_depth, css_validate):
        self.visited_urls = []
        self.root, self.error_page = self.add_scheme(root)
        self.max_depth = max_depth
        self.moved_root = None
        if len(self.root.host) < 4:
            sys.exit('invalid target {}'.format(self.root.host))
        self.target_path = '/opt/snare/pages/{}'.format(self.root.host)

        if not os.path.exists(self.target_path):
            os.mkdir(self.target_path)
        self.css_validate = css_validate
        self.new_urls = Queue()
        self.meta = {}
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def add_scheme(url):
        if url[-1] == '/':
            url = url.strip('/')
        if yarl.URL(url).scheme:
            new_url = yarl.URL(url)
            err_url = yarl.URL(url + '/status_404')
        else:
            new_url = yarl.URL('http://' + url)
            err_url = yarl.URL('http://' + url + '/status_404')
        return new_url, err_url

    async def process_link(self, url, level, check_host=False):
        try:
            url = yarl.URL(url)
        except UnicodeError:
            return None
        if url.scheme == ("data" or "javascript" or "file"):
            return url.human_repr()
        if not url.is_absolute():
            if self.moved_root is None:
                url = self.root.join(url)
            else:
                url = self.moved_root.join(url)

        host = url.host

        if check_host:
            if (host != self.root.host and self.moved_root is None) or \
                    url.fragment or \
                    (self.moved_root is not None and host != self.moved_root.host):
                return None

        if url.human_repr() not in self.visited_urls and (level + 1) <= self.max_depth:
            await self.new_urls.put((url, level + 1))

        res = None
        try:
            res = url.relative().human_repr()
        except ValueError:
            self.logger.error(url)
        return res

    async def replace_links(self, data, level):
        soup = BeautifulSoup(data, 'html.parser')

        # find all relative links
        for link in soup.findAll(href=True):
            res = await self.process_link(link['href'], level, check_host=True)
            if res is not None:
                link['href'] = res

        # find all images and scripts
        for elem in soup.findAll(src=True):
            res = await self.process_link(elem['src'], level)
            if res is not None:
                elem['src'] = res

        # find all action elements
        for act_link in soup.findAll(action=True):
            res = await self.process_link(act_link['action'], level)
            if res is not None:
                act_link['action'] = res

        # prevent redirects
        for redir in soup.findAll(True, attrs={'name': re.compile('redirect.*')}):
            if redir['value'] != "":
                redir['value'] = yarl.URL(redir['value']).relative().human_repr()

        return soup

    def _make_filename(self, url):
        host = url.host
        if url.is_absolute():
            file_name = url.relative().human_repr()
        else:
            file_name = url.human_repr()
        if not file_name.startswith('/'):
            file_name = "/" + file_name

        if file_name == '/' or file_name == "":
            if host == self.root.host or (self.moved_root is not None and self.moved_root.host == host):
                file_name = '/index.html'
            else:
                file_name = host
        m = hashlib.md5()
        m.update(file_name.encode('utf-8'))
        hash_name = m.hexdigest()
        return file_name, hash_name

    async def get_body(self, session):
        while not self.new_urls.empty():
            current_url, level = await self.new_urls.get()
            if current_url.human_repr() in self.visited_urls:
                continue
            self.visited_urls.append(current_url.human_repr())
            file_name, hash_name = self._make_filename(current_url)
            print('name: ', file_name)
            self.meta[file_name] = {}

            data = None
            content_type = None
            try:
                response = await session.get(current_url, headers={'Accept': 'text/html'}, timeout=10.0)
                content_type = response.content_type
                data = await response.read()

            except (aiohttp.ClientError, asyncio.TimeoutError) as client_error:
                self.logger.error(client_error)
            else:
                await response.release()
            if data is not None:
                self.meta[file_name]['hash'] = hash_name
                self.meta[file_name]['content_type'] = content_type
                if content_type == 'text/html':
                    soup = await self.replace_links(data, level)
                    data = str(soup).encode()
                with open(os.path.join(self.target_path, hash_name), 'wb') as index_fh:
                    index_fh.write(data)
                if content_type == 'text/css':
                    css = cssutils.parseString(data, validate=self.css_validate)
                    for carved_url in cssutils.getUrls(css):
                        if carved_url.startswith('data'):
                            continue
                        carved_url = yarl.URL(carved_url)
                        if not carved_url.is_absolute():
                            carved_url = self.root.join(carved_url)
                        if carved_url.human_repr() not in self.visited_urls:
                            await self.new_urls.put((carved_url, level + 1))

    async def get_root_host(self):
        try:
            async with aiohttp.ClientSession() as session:
                resp = await session.get(self.root)
                if resp.host != self.root.host:
                    self.moved_root = resp.url
                resp.close()
        except aiohttp.ClientError as err:
            self.logger.error("Can\'t connect to target host: %s", err)
            exit(-1)

    async def run(self):
        session = aiohttp.ClientSession()
        try:
            await self.new_urls.put((self.root, 0))
            await self.new_urls.put((self.error_page, 0))
            await self.get_body(session)
        except KeyboardInterrupt:
            raise
        finally:
            with open(os.path.join(self.target_path, 'meta.json'), 'w') as mj:
                json.dump(self.meta, mj)
            await session.close()
Example #32
class Cloner(object):
    def __init__(self, root):
        self.visited_urls = []
        self.root = self.add_scheme(root)
        if len(self.root.host) < 4:
            sys.exit('invalid target {}'.format(self.root.host))
        self.target_path = '/opt/snare/pages/{}'.format(self.root.host)

        if not os.path.exists(self.target_path):
            os.mkdir(self.target_path)

        self.new_urls = Queue()

    @staticmethod
    def add_scheme(url):
        if yarl.URL(url).scheme:
            new_url = yarl.URL(url)
        else:
            new_url = yarl.URL('http://' + url)
        return new_url

    @asyncio.coroutine
    def process_link(self, url, check_host=False):
        url = yarl.URL(url)
        if check_host:
            if (url.host != self.root.host or url.fragment
                            or url in self.visited_urls):
                return None
        if not url.is_absolute():
            url = self.root.join(url)

        yield from self.new_urls.put(url)
        return url.relative().human_repr()

    @asyncio.coroutine
    def replace_links(self, data):
        soup = BeautifulSoup(data, 'html.parser')

        # find all relative links
        for link in soup.findAll(href=True):
            res = yield from self.process_link(link['href'], check_host=True)
            if res is not None:
                link['href'] = res

        # find all images and scripts
        for elem in soup.findAll(src=True):
            res = yield from self.process_link(elem['src'])
            if res is not None:
                elem['src'] = res

        # find all action elements
        for act_link in soup.findAll(action=True):
            res = yield from self.process_link(act_link['action'])
            if res is not None:
                act_link['action'] = res

        # prevent redirects
        for redir in soup.findAll(True, attrs={'name': re.compile('redirect.*')}):
            redir['value'] = yarl.URL(redir['value']).relative().human_repr()

        return soup

    @asyncio.coroutine
    def get_body(self):
        while not self.new_urls.empty():
            current_url = yield from self.new_urls.get()
            if current_url in self.visited_urls:
                continue
            self.visited_urls.append(current_url)
            if current_url.name:
                file_name = current_url.name
            elif current_url.raw_path != '/':
                file_name = current_url.path.rsplit('/')[1]
            else:
                file_name = 'index.html'
            file_path = os.path.dirname(current_url.path)
            if file_path == '/':
                file_path = self.target_path
            else:
                file_path = os.path.join(self.target_path, file_path[1:])

            print('path: ', file_path, 'name: ', file_name)

            if file_path and not os.path.exists(file_path):
                os.makedirs(file_path)

            data = None
            try:
                with aiohttp.Timeout(10.0):
                    with aiohttp.ClientSession() as session:
                        response = yield from session.get(current_url)
                        data = yield from response.read()
            except aiohttp.ClientError as client_error:
                print(client_error)
            else:
                response.release()
                session.close()
            if data is not None:
                if re.match(re.compile(r'.*\.(html|php)'), file_name):
                    soup = yield from self.replace_links(data)
                    data = str(soup).encode()
                with open(os.path.join(file_path, file_name), 'wb') as index_fh:
                    index_fh.write(data)
                if '.css' in file_name:
                    css = cssutils.parseString(data)
                    for carved_url in cssutils.getUrls(css):
                        if carved_url.startswith('data'):
                            continue
                        carved_url = yarl.URL(carved_url)
                        if not carved_url.is_absolute():
                            carved_url = self.root.join(carved_url)
                        if carved_url not in self.visited_urls:
                            yield from self.new_urls.put(carved_url)

    @asyncio.coroutine
    def run(self):
        yield from self.new_urls.put(self.root)
        return (yield from self.get_body())