async def process_partitions_queue(
    loop: asyncio.BaseEventLoop,
    partitions_queue: asyncio.Queue,
    results_queue: asyncio.Queue,
    server_address: URL,
    mission_template: Template,
    mission_loader: str,
    width: int,
    scale: int,
) -> None:

    mission_name = mission_loader.split('/', 1)[0]

    async with aiohttp.ClientSession() as http:
        while True:
            partition = await partitions_queue.get()

            if partition is None:
                partitions_queue.task_done()
                return

            await process_partition(
                loop=loop,
                results_queue=results_queue,
                server_address=server_address,
                http=http,
                partition=partition,
                mission_template=mission_template,
                mission_loader=mission_loader,
                mission_name=mission_name,
                width=width,
                scale=scale,
            )
            partitions_queue.task_done()
Example #2
async def udp_writer(s: socket, oqueue: Queue) -> None:
    """Forward packets to the UDP socket."""

    while True:
        peer, data = await oqueue.get()
        try:
            s.sendto(data, peer)
        finally:
            oqueue.task_done()
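A minimal sketch of how this worker might be wired up; the driver below is an assumption, not part of the original snippet. It creates the socket and queue, starts the task, enqueues (peer, data) tuples, then joins the queue and cancels the worker.

import asyncio
import socket


async def main() -> None:
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    oqueue: asyncio.Queue = asyncio.Queue()

    # Start the forwarding worker defined above.
    writer_task = asyncio.create_task(udp_writer(sock, oqueue))

    # Enqueue (peer, payload) tuples for the worker to send.
    await oqueue.put((("127.0.0.1", 9999), b"hello"))
    await oqueue.put((("127.0.0.1", 9999), b"world"))

    # Wait until every packet has been acknowledged via task_done(), then stop.
    await oqueue.join()
    writer_task.cancel()
    sock.close()


# asyncio.run(main())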
Example #3
 async def call(loop, inq: asyncio.Queue):
     while True:
         v = await inq.get()
         logger.debug("consume[S]	v:%s", v)
         if v is None:
             inq.task_done()
             break
         v = await afn(v)
         logger.debug("consume[E]	v:%s", v)
         inq.task_done()
     await inq.join()
     logger.debug("consume[CLOSE]")
Example #4
 async def __call__(self, inq: asyncio.Queue):
     while True:
         v = await inq.get()
         logger.debug("aggregate[S]	v:%s", v)
         if v is None:
             inq.task_done()
             break
         await asyncio.sleep(0.1, loop=self.loop)
         print(v)
         logger.debug("aggregate[E]	v:%s", v)
         inq.task_done()
     await inq.join()
     logger.debug("aggregate[CLOSE]")
Example #5
 async def call(loop, inq: asyncio.Queue, outq: asyncio.Queue):
     while True:
         v = await inq.get()
         logger.debug("communicate[S]	v:%s", v)
         if v is None:
             inq.task_done()
             break
         v = await afn(v)
         logger.debug("communicate[E]	v:%s", v)
         await outq.put(v)
         inq.task_done()
     await inq.join()
     await outq.put(None)
     logger.debug("communicate[CLOSE]")
Example #6
 async def __call__(self, inq: asyncio.Queue, outq: asyncio.Queue):
     while True:
         v = await inq.get()
         logger.debug("communicate[S]	v:%s", v)
         if v is None:
             inq.task_done()
             break
         await asyncio.sleep(0.1, loop=self.loop)
         v = v * v
         logger.debug("communicate[E]	v:%s", v)
         await outq.put(v)
         inq.task_done()
     await inq.join()
     await outq.put(None)
     logger.debug("communicate[CLOSE]")
async def process_results_queue(
    results_queue: asyncio.Queue,
    total_points: int,
    output_file_path: Path,
) -> None:

    point_size = calcsize(HEIGHT_PACK_FORMAT)
    output_size = point_size * total_points

    natural_size = humanize.naturalsize(
        output_size,
        binary=True,
        format='%.3f',
    )
    LOG.debug(f"output size: {natural_size}")

    processed_points = 0

    output_file_path.parent.parent.mkdir(parents=True, exist_ok=True)

    with output_file_path.open('wb') as f:
        f.truncate(output_size)

        while True:
            data = await results_queue.get()
            if not data:
                results_queue.task_done()
                return

            partition, values = data
            start = partition.start * point_size

            processed_points += (partition.end - partition.start) + 1
            progress = (processed_points / total_points) * 100

            LOG.debug(
                f"gather results for range "
                f"[{partition.start}:{partition.end}], "
                f"progress: {progress:.2f}%"
            )

            f.seek(start)
            f.write(values)

            results_queue.task_done()
Example #8
async def kafka_producer(client: Producer, conf: DotDict,
                         queue: Queue) -> None:
    """
    Async producer for Kafka.

    Pulls messages from queue.

    """
    while True:
        msg = await queue.get()
        client.produce(
            conf.kafka_topic,
            key=conf.page_url,
            value=json.dumps(msg),
            on_delivery=_ack_handler,
        )
        client.poll(0)
        queue.task_done()
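A rough sketch of how this producer loop might be driven. In the original module, conf is a dot-accessible DotDict and _ack_handler a delivery callback; the sketch substitutes a SimpleNamespace and a trivial callback, and the broker address and topic are placeholders.

import asyncio
from types import SimpleNamespace

from confluent_kafka import Producer


def _ack_handler(err, msg):
    # Delivery report callback invoked by confluent-kafka during poll()/flush().
    if err is not None:
        print(f"delivery failed: {err}")


async def main() -> None:
    conf = SimpleNamespace(kafka_topic="events", page_url="https://example.org")
    client = Producer({"bootstrap.servers": "localhost:9092"})
    queue: asyncio.Queue = asyncio.Queue()

    producer_task = asyncio.create_task(kafka_producer(client, conf, queue))

    await queue.put({"event": "page_view"})
    await queue.join()      # returns once the message has been handed to librdkafka

    producer_task.cancel()
    client.flush(5)         # wait up to 5 seconds for outstanding deliveries


# asyncio.run(main())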
Example #9
async def InterfaceSender(Client: IoTHubModuleClient,
                          InterfaceOut: asyncio.Queue):
    try:
        while (True):
            data = await InterfaceOut.get()
            print('Interface sender: Message to send.', data)
            msg = json.dumps(data)
            msg = Message(msg)
            try:
                await Client.send_message_to_output(msg, 'InterfaceOut')
                InterfaceOut.task_done()
            except Exception as ex:
                print(
                    'Interface sender: Unexpected error in sender: {}'.format(
                        ex))
            print('Interface sender: Finished sending')
    except asyncio.CancelledError:
        print('Interface sender: Task cancelled')
async def worker(name, in_q: asyncio.Queue, output: asyncio.Queue):
    print(f'worker {name} started')
    while True:
        paperid, authorids = await in_q.get()
        if paperid is None:
            in_q.task_done()
            break
        authors = dict()
        for aid in authorids:
            for pos in db.postlist(aid):
                doc = db.get_document(pos.docid)
                data: bytes = doc.get_data()
                authors[aid] = data.decode('utf-8')
        result = [authors.get(int(i), 'MISSING_DATA') for i in authorids]
        s = json.dumps({'PaperId': paperid, 'Author': {'set': result}}) + '\n'
        await output.put(s)
        in_q.task_done()
    print(f'worker {name} stopping')
Example #11
async def independent_task(queue: asyncio.Queue):
    """tento task je nezavisly na parentovi (neposila mu zadna data zpet, je to napriklad logovani)

    """
    print("starting the child")
    val = await queue.get()

    while val is not None:
        print("Received is %s and processing data" % str(val))
        await asyncio.sleep(
            0.5
        )  # processing takes some time, to demonstrate the queue's capacity
        print("Received data processed")
        queue.task_done()
        val = await queue.get()

    queue.task_done()  # acknowledges the final None value that ended the loop
    print("The client is done here")
Example #12
async def worker(name, in_q: asyncio.Queue, output: asyncio.Queue):
    s = get_async_localhost_session()
    print(f'worker {name} started')
    while True:
        x = await in_q.get()
        if x is None:
            in_q.task_done()
            break
        expr = f'search(mag_papers,q=ConferenceSeriesId:{x["ConferenceSeriesId"]},fl=PaperId, sort="PaperId asc",qt=/export)'
        async with s.collection('mag_papers').stream.expr(expr) as resp:
            response = await resp.json()
            paperids = [doc['PaperId'] for doc in response['result-set']['docs'][:-1]]
            conference_series_name = x['DisplayName']
            for paper in paperids:
                await output.put(json.dumps({'PaperId': paper, 'ConferenceSeries': {'set': conference_series_name}}) + '\n')
        in_q.task_done()
    print(f'worker {name} stopping')
    await s.close()
Example #13
async def token_consumer(in_q: asyncio.Queue):
    async with aiofile.AIOFile(sys.argv[3], 'wb') as f:
        write = aiofile.Writer(f)
        eos = False
        while not eos:
            tokens = deque([await in_q.get()])
            while not in_q.empty():
                token = await in_q.get()
                tokens.append(token)
            if tokens[-1] is OES:
                eos = True
                tokens.pop()
            if tokens:
                transform_tasks = [transform_token(t) for t in tokens]
                transformed = await asyncio.gather(*transform_tasks)
                await write(b''.join(transformed))
            in_q.task_done()
        await f.fsync()
async def content_consumer(worker_id: int, queue: asyncio.Queue,
                           tmpdir: str) -> None:
    """
    Save a content to the localstorage
    """
    while True:
        content = await queue.get()
        # Add timeout to see how it works
        if os.environ.get("DEBUG", False):
            await asyncio.sleep(1 + (1 * worker_id))

        filename = os.path.join(tmpdir, f"async_{str(uuid.uuid4())}.mov")

        async with aiofiles.open(filename, "wb") as video_file:
            await video_file.write(content)
            logger.debug(f"[WORKER {worker_id}] Finished writing {filename}")

        queue.task_done()
Example #15
async def downloader(db: DB, info_queue: Queue):
    """合集图片下载器"""
    logger.info(f"任务:{asyncio.current_task().get_name()} 启动")
    while True:
        collection_number, collection_name, url_list = await info_queue.get()
        logger.info(f"开始下载合集:{collection_name},共有{len(url_list)}张图片")

        # Number of images in this collection that failed to download; if greater than 10, the collection download is treated as failed
        fail_count = 0
        for img_url in url_list:
            await sleep(random.uniform(.5, 2.5))

            file_name = img_url.split("/")[-1]

            # Check the directory name and strip illegal path characters from it
            for char in invalid_chars_in_path:
                if char in collection_name:
                    collection_name = collection_name.replace(char, "")
            dir_path = os.path.join(DL_PATH, collection_name)

            if not os.path.exists(dir_path):
                try:
                    os.mkdir(dir_path)
                except NotADirectoryError:
                    os.mkdir(os.path.join(DL_PATH, "unknown"))

            file_path = os.path.join(dir_path, file_name)

            # Skip files that have already been downloaded
            if os.path.exists(file_path):
                continue

            try:
                await dl_session.get(img_url, file_path=file_path)
                logger.debug(f"{file_path} 下载完毕")

            except ConnectionError:
                fail_count += 1

        if fail_count < 10:
            db.update_picture_status(collection_number, 1)
            info_queue.task_done()
        else:
            logger.warning(f"合集:{collection_name} 由于图片失败太多导致下载失败")
Example #16
async def bouquet_designs_consumer(
    bouquet_designs_queue: asyncio.Queue,
    flowers_queue: asyncio.Queue,
    bouquets_queue: asyncio.Queue,
):
    """Taking care on bouquets and their state."""
    worker_id = int(random.random() * 1000)
    bd_str = await bouquet_designs_queue.get()
    bd = BouquetDesign.from_str(bd_str)
    logger.info(
        f"Proccesing of a bd {repr(bd_str)} "
        f"started by worker id {repr(worker_id)}..."
    )
    bouquet = Bouquet(name=bd.name, design=bd)
    while True:
        fl_str = await flowers_queue.get()
        logger.debug(
            f"Flower {repr(fl_str)} received by "
            f"worker id {repr(worker_id)}"
        )
        flower = Flower.from_str(fl_str)
        try:
            await bouquet.use(
                flower, additional_debug_str=f"Worker id {repr(worker_id)}"
            )
        except KeyError:
            # if the flower is not compatible with the design,
            # we put it back into the queue and allow the context
            # to switch to another task
            await flowers_queue.put(fl_str)
            logger.debug(
                f"Worker id {repr(worker_id)} Flower {repr(fl_str)}"
                f" returned to the queue"
            )
            await asyncio.sleep(0)
        else:
            if bouquet.is_ready:
                logger.info(
                    f"Bouquet {repr(bouquet.to_str())} produced by worker"
                    f" {repr(worker_id)}"
                )
                bouquet_designs_queue.task_done()
                await bouquets_queue.put(bouquet.to_str())
                asyncio.current_task().cancel()
Example #17
async def consumer(q: asyncio.Queue, name):
    """ Реализация Consumer """
    progress = tqdm(desc=f'consumer #{name}', leave=False)
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
    }
    async with aiohttp.ClientSession(headers=headers) as session:
        while True:
            url = await q.get()
            if url_is_fetched(url):
                q.task_done()
                continue
            data = await fetch(url, session)
            with open(get_filename_for_write(url), 'w',
                      encoding='utf-8') as file:
                file.write(data.decode('utf-8'))
            progress.update()
            q.task_done()
Example #18
 async def worker(self, queue: asyncio.Queue):
     if self.use_requests:
         await self._requests_worker(queue)
     else:
         async with aiohttp.ClientSession(
                 raise_for_status=True,
                 headers=[("User-Agent", ua.random)],
                 timeout=aiohttp.ClientTimeout(total=60)) as sess:
             while True:
                 url = await queue.get()
                 try:
                     es.Page.get(id=url)
                     log.info('page existed, skip {}'.format(url))
                 except elasticsearch.NotFoundError:
                     try:
                         # resp, html = await fetch.get(url)
                         async with sess.get(url) as resp:
                             log.info('page fetching {}'.format(url))
                             html = await resp.text()
                             log.info('page downloaded {}'.format(url))
                             self.parse(url, resp, html)
                             await asyncio.sleep(3)
                             log.info('page scraped {}'.format(url))
                     except aiohttp.ClientResponseError as e:
                         page = es.Page(
                             from_url=url,
                             resolved_url=str(e.request_info.real_url),
                             http_status=e.status,)
                         page.save()
                         log.info("fetch error & skiped: {}".format(e))
                         log.error(e)
                         self.error_urls.append(url)
                     except Exception as e:
                         log.info(
                             "scrape internal error & skiped: {}".format(e))
                         log.error(e)
                         self.error_urls.append(url)
                 except Exception as e:
                     log.info("scrape internal error & skiped: {}".format(e))
                     log.error(e)
                     self.error_urls.append(url)
                 finally:
                     queue.task_done()
Example #19
    async def LogFileDict_toList(self, queue_df: asyncio.Queue, Logfile_List: Optional[list] = None) -> str:
        """
        :param self: Cisco_Function new instance
        :param Logfile_List: List which stores each line of Cisco firewall config
        """
        # Version 2.0 - old approach; its execution efficiency was a big problem:
        # for item in Logfile_List:
        #     self.df_cisco = self.df_cisco.append(self.Analyze_CiscoContent(each_content=item), ignore_index=True)

        # Collecting all generated dicts in a list first improves this from 34062 ms to 34 ms, roughly 1000x
        # Version 2.1 - store each config line as a dict and create a new DataFrame from the list

        while True:
            insert_dict = await queue_df.get()
            queue_df.task_done()
            if queue_df.empty() and insert_dict == "readline_complete":
                return "Finished! - LogFileDict_toList"
            self.dict_list.append(insert_dict)
Example #20
class AsyncDownloader:
    def __init__(self, concurrence=10, headers=None):
        self.queue = Queue(concurrence)
        self.headers = headers
        self.concurrence = concurrence

    async def push_task(self, urls):
        for idx, url in enumerate(urls):
            await self.queue.put((idx, url))
            print(f"pushed {idx}")

    async def worker(self):
        async with aiohttp.ClientSession() as session:
            while True:
                # print('worker start')
                idx, url = await self.queue.get()
                # print(f'{idx} start')
                rsp = await self.download(session=session,
                                          url=url,
                                          headers=self.headers)
                await self.save(rsp)
                self.queue.task_done()

    async def process(self, urls):
        tasks = asyncio.create_task(self.push_task(urls))
        workers = [
            asyncio.create_task(self.worker()) for _ in range(self.concurrence)
        ]
        await tasks
        await self.queue.join()
        for worker in workers:
            worker.cancel()

    @staticmethod
    async def download(session, url, headers=None, method='GET'):
        async with session.request(method=method, url=url,
                                   headers=headers) as rsp:
            data = await rsp.text()
        return data

    async def save(self, data):
        pass
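Usage could look roughly like the sketch below; the URLs are placeholders and save() is left as the no-op defined above. Constructing the downloader inside a running event loop keeps the internal Queue bound to the right loop on older Python versions.

import asyncio


async def main() -> None:
    urls = [
        "https://example.com/",
        "https://example.org/",
    ]
    downloader = AsyncDownloader(concurrence=5)
    await downloader.process(urls)


# asyncio.run(main())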
Example #21
class Test:
    def __init__(self):
        self.que = Queue()
        self.pue = Queue()

    async def consumer(self):
        while True:
            try:
                print('consumer', await self.que.get())
            finally:
                try:
                    self.que.task_done()
                except ValueError:
                    if self.que.empty():
                        print("que empty")

    async def work(self):
        while True:
            try:
                value = await self.pue.get()
                print('producer', value)
                await self.que.put(value)
            finally:
                try:
                    self.pue.task_done()
                except ValueError:
                    if self.pue.empty():
                        print("pue empty")

    async def run(self):
        tasks = [asyncio.ensure_future(self.work()),
                 asyncio.ensure_future(self.consumer())]

        await asyncio.wait([self.pue.put(i) for i in range(10)])

        print('p queue join')
        await self.pue.join()
        print('p queue is done & q queue join')
        await self.que.join()
        print('q queue is done')

        asyncio.gather(*tasks).cancel()
Example #22
async def update_products(queue_in: asyncio.Queue, queue_out: asyncio.Queue):
    session = FuturesSession()
    while True:
        product = await queue_in.get()
        if product is None:
            queue_in.task_done()
            await queue_out.put(None)
            break
        queue_in.task_done()
        url = product.url if product.update_url is None else product.update_url
        soup = await get_soup(
            url, session, product.shop.cookies
        )  #     soup = await get_soup(url, session, shop.cookies)
        try:
            updated_product = await product.shop.product_parser(product, soup)
        except Exception as e:
            logger.exception("Exception occured:  %s",
                             getattr(e, "__dict__", {}))
        await queue_out.put(product)
        print(f'updated {product.shop.name} {product.name}')
async def worker(name, in_q: asyncio.Queue, output: asyncio.Queue):
    connection = await asyncpg.connect(user='******', database='mag')
    print(f'worker {name} started')
    while True:

        paperid, authorids = await in_q.get()
        if paperid is None:
            in_q.task_done()
            break
        query = ', '.join(authorids)
        q = f'SELECT "AuthorId", "DisplayName" FROM authors WHERE "AuthorId" in ({query});'
        # print(f'{name}: {q}')
        authors = await connection.fetch(q)
        authors = dict([tuple(a) for a in authors])
        result = [authors.get(int(i), 'MISSING_DATA') for i in authorids]
        s = json.dumps({'PaperId': paperid, 'Author': {'set': result}}) + '\n'
        # await output.put(s)
        in_q.task_done()
    print(f'worker {name} stopping')
    await connection.close()
Example #24
async def ffplay(queue: asyncio.Queue):
    """
    Play media asynchronously.
    Each task runs endlessly until .cancel()
    """
    assert isinstance(FFPLAY, str)

    while True:
        filein = await queue.get()

        cmd = [FFPLAY, "-loglevel", "warning", "-autoexit", str(filein)]

        proc = await asyncio.create_subprocess_exec(*cmd)

        ret = await proc.wait()

        if ret != 0:
            print(filein, "playback failure", file=sys.stderr)

        queue.task_done()
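One hedged way to drive this player: a small pool of playback tasks consuming a shared queue, cancelled once the queue is drained. FFPLAY is defined by the original module; it is re-declared here only so the sketch is self-contained, and the file list is a placeholder.

import asyncio

FFPLAY = "ffplay"  # the original module is assumed to resolve this to the ffplay binary


async def play_all(files) -> None:
    queue: asyncio.Queue = asyncio.Queue()
    for f in files:
        queue.put_nowait(f)

    # A small pool of player tasks; each loops forever until cancelled.
    players = [asyncio.create_task(ffplay(queue)) for _ in range(2)]

    await queue.join()          # every file has been played and task_done()'d
    for task in players:
        task.cancel()


# asyncio.run(play_all(["intro.mp3", "outro.mp3"]))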
Example #25
File: core.py  Project: mouncg/NIBO
async def worker(q: asyncio.Queue):
    global threads, uLock, tasks
    while True:
        xfn = await q.get()  # type: Thread
        print("x")
        uLock[f"{xfn.uid}"] = False
        if run.get(xfn.uid) is True:
            print("y")
            # Get a "work item" out of the queue.

            # Sleep for the "sleep_for" seconds.
            if xfn.stopped():
                xfn.start()
                threads.append(xfn)

                # Notify the queue that the "work item" has been processed.
                q.task_done()
                q.put_nowait(xfn)
            else:
                print(f"Running?\nproc syst:{xfn}\n{xfn.stopped()}")
Example #26
async def upload_training(session: aiohttp.ClientSession,
                          training_data_queue: asyncio.Queue,
                          strava_access_token: str):
    while True:
        training_data = await training_data_queue.get()

        data = {'file': training_data, 'data_type': 'tcx'}

        for i in range(3):
            response = await post_training(session, data, strava_access_token)
            print('upload status:', response.status)
            print('upload resp:', await response.read())
            if response.status != 429:
                print("uploaded training")
                training_data_queue.task_done()
                break
            print(f"sleeping for 15 minutes")
            await asyncio.sleep(60 * 15 + 10)
        else:
            print('ERROR: not uploaded due to rate limits')
Example #27
    async def LogFile_toList(self, queue_df: asyncio.Queue):
        """
        :param self:
        :param queue_df:
        """
        # In Version 2.0, the old code block below limited the performance of creating the pandas DataFrame:
        # for item in content_list:
        #     try:
        #         self.df_topsec = self.df_topsec.append(self.Analyze_TopSec(each_line=item), ignore_index=True)
        #     except (NameError, TypeError, RuntimeError, IndexError) as err:
        #         config.Logger.log_warning("Below Config is not Supported by this Program! Please Check")
        #         print(item)

        # New Version 2.1:
        while True:
            process_dict = await queue_df.get()
            queue_df.task_done()
            if queue_df.empty() and process_dict == "complete_process":
                return "LogFile_toList Function - complete"
            self.df_dict_list.append(process_dict)
Example #28
async def collect_number(db: DB, tag_detail_url_queue: Queue,
                         number_queue: Queue):
    """从tag详情页中提取合集编号,并将未记录的编号入库、入队"""
    logger.info(f"任务:{asyncio.current_task().get_name()} 启动")
    db_numbers = db.get_all_collection_numbers()
    while True:
        url = await tag_detail_url_queue.get()

        numbers = await extract_number_in_tag(url)
        new_numbers = set(numbers) - set(db_numbers)
        if not new_numbers:
            tag_detail_url_queue.task_done()
            continue

        for number in new_numbers:
            await number_queue.put(number)
        db.batch_add_collection_number(new_numbers)

        tag_detail_url_queue.task_done()
        logger.debug(f"新入库、入队 {len(new_numbers)} 条编号")
Example #29
class Channel:
    """Holds messages for an Event in Bus"""
    def __init__(self):
        """Creates a Channel which has it's own queue of messages"""
        self._queue = Queue()

    def __aiter__(self):
        return self

    async def __anext__(self):
        data = await self._queue.get()
        self._queue.task_done()

        if data == 'STOP':
            raise StopAsyncIteration

        return data

    async def put(self, data):
        await self._queue.put(data)
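Since Channel implements the async-iterator protocol, a consumer can simply iterate over it with async for; putting the 'STOP' sentinel ends the iteration. A small illustrative driver (the event names are made up):

import asyncio


async def demo() -> None:
    channel = Channel()

    async def producer():
        for event in ("created", "updated", "deleted"):
            await channel.put(event)
        await channel.put('STOP')   # sentinel: __anext__ raises StopAsyncIteration

    asyncio.create_task(producer())

    async for message in channel:   # uses __aiter__/__anext__ defined above
        print("received:", message)


# asyncio.run(demo())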
Example #30
async def poll_until_done(session: aiohttp.ClientSession, q1: asyncio.Queue,
                          q2: asyncio.Queue):
    """Poll specific experiment until it's done or failed."""

    counter_max = int(os.getenv("UC1D_POLLING_RETRIES", "30"))
    freq = float(os.getenv("UC1D_POLLING_FREQUENCY", "0.1"))

    while True:
        # Retrieve first item from queue
        id, req_id, href = await q1.get()
        headers = {"X-Request-Id": req_id}

        # Poll status of simulation
        counter = 0
        href_result = None
        while counter < counter_max:
            if counter > (counter_max / 2):
                freq *= 2
            logger.log("REQUEST", f"GET {href}")
            async with session.get(href, headers=headers) as res:
                rep = await res.json()
                status = rep["status"]
                logger.trace(
                    f"Polling status of simulation for individual '{id}': {status}"
                )

                if status == "DONE":
                    href_result = rep["linkToResult"]
                    break
                if status == "FAILED":
                    logger.warning("Simulation failed")
                    break

                counter += 1
                await asyncio.sleep(freq)

        # Enqueue link to result
        await q2.put((id, req_id, href_result))

        # Indicate that a formerly enqueued task is complete
        q1.task_done()
Example #31
    async def process(self, queue: Queue, futures: List[Future],
                      batch: List[FigmentContext]):
        """ Have the Figmentator process a batch """
        try:
            results = await self.loop.run_in_executor(
                self.executor, self.figmentator.figmentate, batch)
            for future, result in zip(futures, results):
                # Set the result of the future
                future.set_result(result)

                # Need to notify the task queue for each item in the batch
                queue.task_done()
        except Exception as e:  # pylint:disable=broad-except
            logging.error("Caught exception: %s", str(e))
            self.ready.clear()
            for future in futures:
                # Set the exception on the future
                future.set_exception(e)

                # Need to notify the task queue for each item in the batch
                queue.task_done()
Example #32
 async def _subscription_queue_processor(self,
                                         queue: asyncio.Queue) -> None:
     try:
         while True:
             item = await queue.get()
             job_state = item["fields"]
             self._state[job_state["id"]] = job_state
             queue.task_done()
             job = self._get_job_no_fetch(job_state["id"])
             if (JobStatus.is_completed(job.status)
                     and job.id in self._job_wait_futures):
                 self._job_wait_futures[job.id].set_result(job)
                 del self._job_wait_futures[job.id]
     except asyncio.CancelledError:
         logger.debug(
             "core.get_jobs subscription work processing is getting canceled"
         )
         raise
     except Exception as exc:
         logger.exception("exception while processing core.get_jobs data",
                          exc_info=exc)
Example #33
async def fetch_simulation_result(session: aiohttp.ClientSession,
                                  q: asyncio.Queue, q_repr_all: list):
    """Get the simulation result and parse it as dataframe."""

    while True:
        # Retrieve first item from queue
        id, req_id, href = await q.get()
        headers = {"X-Request-Id": req_id}

        # Get simulation result
        logger.info(f"Retrieving result of simulation for individual '{id}''")
        async with session.get(href, headers=headers) as res:
            logger.log("REQUEST", f"GET {href}")
            rep = await res.json()
            logger.trace(json.dumps(rep, indent=JSON_DUMPS_INDENT))

            # Enqueue for post-processing
            q_repr_all.append((id, rep))

        # Indicate that a formerly enqueued task is complete
        q.task_done()
Example #34
async def handle_main_events(
    run_state: RunState,
    mqtt_send_q: asyncio.Queue,
    garage_events_q: asyncio.Queue,
    main_events_q: asyncio.Queue,
    poller_ticker_q: asyncio.Queue,
):
    handlers = {
        "GarageStateEvent": handle_main_event_garage,
        "MqttMsgEvent": handle_main_event_mqtt,
    }
    while True:
        main_event = await main_events_q.get()
        logger.debug(f"Handling {main_event.event}...")
        handler = handlers.get(main_event.event)
        if handler:
            await handler(main_event, run_state, mqtt_send_q, garage_events_q,
                          poller_ticker_q)
        else:
            logger.error(f"No handler found for {main_event.event}")
        main_events_q.task_done()
Example #35
    async def _download_worker(self, wk_name: str, queue: asyncio.Queue):
        downloaded_prices = []
        try:
            while True:
                day: date = await queue.get()
                tic = monotonic()
                prices = await self._download_pvpc_prices(day)
                took = monotonic() - tic
                queue.task_done()
                if not prices:
                    self._logger.warning(
                        "[%s]: Bad download for day: %s in %.3f s", wk_name, day, took
                    )
                    continue

                downloaded_prices.append((day, prices))
                self._logger.debug(
                    "[%s]: Task done for day: %s in %.3f s", wk_name, day, took
                )
        except asyncio.CancelledError:
            return downloaded_prices
Example #36
async def report_sightings(sub_endpoint: str, sightings_queue: asyncio.Queue):
    """
    Starts a ZeroMQ publisher on the given endpoint and publishes sightings from
    the sightings_queue to Threat Bus.
    @param sub_endpoint A host:port string to connect to via ZeroMQ
    @param sightings_queue The queue to receive sightings from
    """
    socket = zmq.Context().socket(zmq.PUB)
    socket.connect(f"tcp://{sub_endpoint}")
    topic = "stix2/sighting"
    logger.info(f"Forwarding sightings to Threat Bus at {sub_endpoint}/{topic}")
    while True:
        sighting = await sightings_queue.get()
        if type(sighting) is not Sighting:
            logger.warning(
                f"Ignoring unknown message type, expected Sighting: {type(sighting)}"
            )
            continue
        socket.send_string(f"{topic} {sighting.serialize()}")
        sightings_queue.task_done()
        logger.debug(f"Reported sighting: {sighting}")
Example #37
 async def stream_to_postgres(self, q: asyncio.Queue):
     try:
         conn = await asyncpg.connect(self.config.conn_uri)
     except Exception as e:  # noqa
         self._exception = e
         self.file_reader_task.cancel()
         return 0
     log.debug('[stream_to_postgres] Connected to %s', self.config.conn_uri)
     num_rows_written = 0
     try:
         await conn.execute(f'''
             CREATE TABLE IF NOT EXISTS {self.config.table_name} (
                 {self.schema})''')
         eos = False
         while not eos:
             records = deque([await q.get()])
             while not q.empty():
                 record = await q.get()
                 records.append(record)
             if records[-1] is EOS:
                 eos = True
                 records.pop()
             if records:
                 status = await conn.copy_records_to_table(
                     self.config.table_name, records=records)
                 num_rows_written += parse_insert_status_string(status)
                 q.task_done()
         log.debug('[stream_to_postgres] Wrote %d rows', num_rows_written)
     except KeyboardInterrupt:
         log.warning('[stream_to_postgres] User interrupt')
     except asyncio.CancelledError:
         log.warning('[stream_to_postgres] Task cancelled')
         raise
     except Exception as e:  # noqa
         log.error('[stream_to_postgres] Exception: %s', e)
         raise
     finally:
         await conn.close()
     print('[stream_to_postgres] returning')
     return num_rows_written
Example #38
class WebCrawler:
    '''
        WebCrawler class, starts at the root domain of a given resource.

        It starts on the root page, finds all links, and follows them recursively.

        Initialize a new webcrawler instance.

        @param(basePath): The root of the domain to crawl
    '''
    def __init__(self, basePath, max_tasks=25):
        
        # max concurrent tasks
        self.max_tasks = max_tasks
        
        # we have seen this url
        self.processed = set()
        
        # BasePath of url to start crawl, should be root of a domain 
        self.basePath = basePath
        
        # event loop; we do not fall back to IOCP (win32), select, or any other event loop, we only use the default asyncio event loop
        self.loop = asyncio.get_event_loop()
        
        # create our session, which encapsulates a connection pool
        self.session = aiohttp.ClientSession(loop=self.loop)
        
        # get Queue
        self.queue = Queue(loop=self.loop)
        
        # first url
        self.queue.put_nowait(self.basePath)
        
        # JSON for visualization
        self.data = []
        
    
    '''
        Check if this is static data
    '''
    def _is_static_(self):
        # A resource counts as "static" here when it looks cachable; a Pragma: no-cache and/or
        # Cache-Control: no-cache header is what really marks it as a dynamic asset.
        pass
        
        
    '''
        Get all static assets on a page
    '''
    def get_static(self, s, url):
        # hacky but works
        scripts = [ x['src'] for x in s.findAll('script') if x.has_attr('src') and (x["src"].startswith('/') and not x['src'][1] == '/')]
        styles = [ x['href'] for x in s.findAll('link') if x.has_attr('href') and x["href"].startswith('/') ]
        return scripts + styles
        
    '''
        Cleanup on aiohttp
    '''
    def close(self):
        try:
            # aiohttp keeps a TCP connection alive for 30secs, this explicitly closes it
            self.session.close()
        except:
            pass
  
    '''
        Process is a coroutine which our tasks/workers/threads/coroutines/whatever will do their corresponding work.
        Each process will fetch their urls from the queue for processing.
    '''
    async def process(self):
        try:
            while True:
                try:
                    # suspend until we get a new url to work on
                    url = await self.queue.get()
                    
                    # remove trailing slash
                    if url[-1] == '/':
                        url = url[:-1]
                    
                    # we have not seen this url, so we fetch it and add it
                    if url not in self.processed:
                        self.processed.add(url)
                        
                        # suspend execution until we get data from our HTTP request
                        resp = await self.fetch(url)
                        
                        if resp is not None:
                            # add to sites
                            self.data.append(resp)
                        
                            # go through each link and add them to the queue if we have not traversed them
                            links = [x for x in resp['links'] if x.startswith('/') or x.startswith(url)]
                            for link in links:
                                
                                # formatting
                                if not link.startswith(self.basePath):
                                    link = self.basePath + link
                                
                                if '#' in link:
                                    link = link[:link.index('#')]
                                
                                # add it to our queue for processing
                                if link not in self.processed:
                                    if link != '' and link is not None:
                                        self.queue.put_nowait(link)
                                    
                    # this task is done
                    self.queue.task_done()
                    
                            
                        
                except Exception as err:
                    pass
                    
        except asyncio.CancelledError:
            pass
  
    
    '''
        Parsed a url for links and other stuff too
    '''
    def parse(self, data, url):
        # parse a single url
        s = soup(data.decode('utf-8', 'ignore'), "html.parser")
        
        # get links
        links = [ x['href'] for x in s.findAll('a') if x.has_attr('href') ]
        
        # get assets 
        assets = self.get_static(s, url)
        
        # get title
        title = s.find('title')
        
        if title != None:
            title = title.text
        else:
            title = ''
            
        return {
            'url': url,
            'title': title,
            'links': links,
            'assets': assets
        }
    

    '''
        Put our JSONStatham in a file
    '''
    def _save_file(self):

        # save data
        with open('sitemap.json', 'w') as sitemapfile:
            json.dump({
                "sitemap": "Sitemap generated for URL {} on {}. {} pages parsed.".format(self.basePath, datetime.now(), len(self.processed)),
                "sites": self.data
            }, sitemapfile)
    
    
    '''
        Start ze crawl
    '''
    def crawl(self):
        try:
            # crawl until complete
            self.loop.run_until_complete(self.__crawl__())
            
        except KeyboardInterrupt:
            sys.stderr.flush()
        finally:
            pass
            

  
    '''
        Asynchronous crawl
    '''
    async def __crawl__(self):
        print('Starting webcrawler on url {}'.format(self.basePath))
        
        t1 = time.time()
        # make tasks that are processing the queue
        tasks = [asyncio.ensure_future(self.process(), loop=self.loop) for _ in range(self.max_tasks)]
        
        # aggregate tasks and squash exceptions
        asyncio.gather(*tasks, return_exceptions=True)
        
        # all queue items should call task_done for each put
        await self.queue.join()
        
        # cancel tasks
        for t in tasks:
             t.cancel()
        
        self.close()
        self.loop.stop()
        
        # save JSON file for viewing
        self._save_file()
        
        # 
        print('{} pages processed in {} secs. Data saved in sitemap.json'.format(len(self.processed), time.time() - t1))
        
        # leave
        exit(1)

    '''
        HTTP request a page.
    '''         
    async def fetch(self, url):
        try:
                
            # alright, so i really should be handling redirects myself, but i'm not, because of reasons
            async with self.session.get(url, allow_redirects=False) as r:
                assert r.status == 200
                # Get the page and parse it
                resp = self.parse(await r.read(), url)    
                return resp
        except Exception:
            # Do not call task_done() here; process() already calls it once per URL it gets.
            return None
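Typical usage of the class above is just the two lines below (the URL is a placeholder; the module's own imports, such as aiohttp and BeautifulSoup, are assumed to be present):

crawler = WebCrawler("https://example.com", max_tasks=10)
crawler.crawl()   # blocks until the queue drains, then writes sitemap.json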
Example #39
class Crawler(object):
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    """

    def __init__(
        self,
        roots,
        scraper=None,
        data_handler=None,
        exclude=None,
        strict=True,  # What to crawl.
        max_redirect=5,
        max_tries=10,  # Per-url limits.
        max_tasks=10,
        max_connections_per_host=3,
        *,
        loop=None
    ):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.max_connections_per_host = max_connections_per_host
        self.scraper = scraper
        self.data_handler = data_handler
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r"\A[\d\.]*\Z", host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_urls(root)
        self.t0 = time.time()
        self.t1 = None

    def record_statistic(
        self,
        url=None,
        next_url=None,
        status=None,
        exception=None,
        content_type=None,
        encoding=None,
        num_urls=0,
        num_new_urls=0,
    ):
        """Record the FetchStatistic for completed / failed URL."""
        fetch_statistic = FetchStatistic(
            url=url,
            next_url=next_url,
            status=status,
            size=0,
            exception=exception,
            content_type=content_type,
            encoding=encoding,
            num_urls=num_urls,
            num_new_urls=num_new_urls,
        )
        self.done.append(fetch_statistic)

    def extract_data(self, root_url, html):
        raise NotImplementedError("You need to define an extract_data method!")

    def close(self):
        """Close resources."""
        LOGGER.debug("closing resources")
        self.session.close()

    @asyncio.coroutine
    def parse_links(self, web_page_html, base_url, _content_type, _encoding):
        """Return a list of links."""
        links = set()
        tree = html.fromstring(web_page_html)
        tree.make_links_absolute(base_url)
        urls = [link[2] for link in tree.iterlinks()]
        for url in urls:
            defragmented, frag = urllib.parse.urldefrag(url)
            if verify.url_allowed(
                defragmented, self.root_domains, exclude=self.exclude
            ):  # Select Valid links, testing against regexp and root_domains
                links.add(defragmented)
        if urls:
            LOGGER.info(
                "got %r urls from %r new links: %i visited: %i",
                len(urls),
                base_url,
                len(links - self.seen_urls),
                len(self.seen_urls),
            )
        new_links = [link for link in links.difference(self.seen_urls)]

        self.record_statistic(
            url=base_url,
            content_type=_content_type,
            encoding=_encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls),
        )
        return new_links

    def handle_redirect(self, response, url, max_redirect):
        location = response.headers["location"]
        next_url = urllib.parse.urljoin(url, location)
        self.record_statistic(url=url, next_url=next_url, status=response.status)
        if next_url in self.seen_urls:
            return
        if max_redirect > 0:
            LOGGER.info("redirect to %r from %r max_redir: %i", next_url, url, max_redirect - 1)
            self.add_urls(next_url, max_redirect - 1)
        else:
            LOGGER.error("redirect limit reached for %r from %r", next_url, url)
        return

    @asyncio.coroutine
    def fetch(self, url, max_redirect, sem):
        """Fetch one URL."""
        tries = 0
        web_page = None
        exception = None
        _url = None
        _encoding = None
        _content_type = None
        sleep_time = 0
        while tries < self.max_tries:
            try:
                with (yield from sem):
                    response = yield from asyncio.wait_for(
                        self.session.get(url, allow_redirects=False), 10, loop=self.loop
                    )
                if tries > 1:
                    LOGGER.debug("try %r for %r success", tries, url)
                break
            except Exception as client_error:
                sleep_time += 5
                yield from asyncio.sleep(sleep_time)
                LOGGER.error("try %r for %r raised %r", tries, url, client_error)
                exception = client_error
            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            LOGGER.error("%r failed after %r tries", url, self.max_tries)
            self.record_statistic(url=url, exception=exception)
            return (web_page, _url, _content_type, _encoding)
        try:
            _url, _content_type, _encoding = get_content_type_and_encoding(response)
            if is_redirect(response):
                self.handle_redirect(response, url, max_redirect)
                web_page = "redirect"
            elif response.status == 200 and _content_type in ("text/html", "application/xml"):
                web_page = yield from response.text()
            else:
                self.record_statistic(
                    url=response.url, status=response.status, content_type=_content_type, encoding=_encoding
                )
        except Exception as e:
            print("*******error**********")
        finally:
            yield from response.release()
        return (web_page, _url, _content_type, _encoding)

    def add_urls(self, urls, max_redirect=None):
        """Add a URL to the queue if not seen before."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        if not isinstance(urls, str):
            urls = set(urls)
            for link in urls.difference(self.seen_urls):
                self.q.put_nowait((link, max_redirect))
            self.seen_urls.update(urls)
        elif urls not in self.seen_urls:
            self.q.put_nowait((urls, max_redirect))
            self.seen_urls.add(urls)

    @asyncio.coroutine
    def work(self, sem):
        """Process queue items forever."""
        try:
            while True:
                url, max_redirect = yield from self.q.get()
                # assert url in self.seen_urls
                web_page, url, content_type, encoding = yield from self.fetch(url, max_redirect, sem)
                if web_page and web_page != "redirect":
                    new_links = yield from self.parse_links(web_page, url, content_type, encoding)
                    if self.scraper:
                        data = self.scraper.scrape(url, web_page)
                    if self.data_handler:
                        self.data_handler.handle(data)
                    self.add_urls(new_links)
                self.q.task_done()
        except (asyncio.CancelledError,):
            print("error")

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all finished."""
        sem = asyncio.Semaphore(value=self.max_connections_per_host, loop=self.loop)
        LOGGER.info("Starting crawl...")
        workers = [asyncio.Task(self.work(sem), loop=self.loop) for _ in range(self.max_tasks)]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
Example #40
class Crawler:
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    """
    def __init__(self, roots,
                 exclude=None, strict=True,  # What to crawl.
                 max_redirect=10, max_tries=4,  # Per-url limits.
                 max_tasks=10, *, loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None

    def close(self):
        """Close resources."""
        self.session.close()

    def host_okay(self, host):
        """Check if a host should be crawled.

        A literal match (after lowercasing) is always good.  For hosts
        that don't look like IP addresses, some approximate matches
        are okay depending on the strict flag.
        """
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        """Check if a host should be crawled, strict-ish version.

        This checks for equality modulo an initial 'www.' component.
        """
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        """Check if a host should be crawled, lenient version.

        This compares the last two components of the host.
        """
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    @asyncio.coroutine
    def parse_links(self, response):
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()

                # Replace href with (?:href|src) to follow image links.
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r',len(urls), response.url)
                for url in urls:
                    normalized = urllib.parse.urljoin(response.url, url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stat = FetchStatistic(
            url=response.url,
            next_url=None,
            status=response.status,
            exception=None,
            size=len(body),
            content_type=content_type,
            encoding=encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls))

        return stat, links

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from self.session.get(url, allow_redirects=False)  #1
                break  #2
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r', tries, url, client_error)
                exception = client_error
            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            LOGGER.error('%r failed after %r tries', url, self.max_tries)
            return
            
        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                if next_url not in self.seen_urls:
                    if max_redirect > 0:
                        LOGGER.info('redirect to %r from %r', next_url, url)
                        self.add_url(next_url, max_redirect - 1)
                    else:
                        LOGGER.error('redirect limit reached for %r from %r',
                                     next_url, url)
            else:  #4
                stat, links = yield from self.parse_links(response)
                self.record_statistic(stat)
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            yield from response.release()

    @asyncio.coroutine
    def work(self):
        """Process queue items forever."""
        try:
            while True:
                url, max_redirect = yield from self.q.get()  #q.get() Remove and return an item from the queue. If queue is empty, wait until an item is available.
                #print('url',url, 'max_redirect', max_redirect)
                assert url in self.seen_urls  # assert: an AssertionError would propagate directly
                yield from self.fetch(url, max_redirect)
                self.q.task_done()  # Indicate that a formerly enqueued task is complete.
        except asyncio.CancelledError:
            pass

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            LOGGER.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            LOGGER.debug('skipping non-root host in %r', url)
            return False
        return True

    def add_url(self, url, max_redirect=None):
        """Add a URL to the queue if not seen before."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        LOGGER.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))  # put_nowait(): put an item into the queue without blocking; this actually runs first

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all finished."""
        workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)]
        self.t0 = time.time()
        yield from self.q.join()  # Block until all items in the queue have been gotten and processed.
        self.t1 = time.time()
        for w in workers:
            w.cancel()
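A driver for this crawler might look like the following, using the pre-3.10 loop style that matches the loop= arguments above; the root URL is a placeholder.

import asyncio

loop = asyncio.get_event_loop()
crawler = Crawler(["https://example.com/"], max_tasks=5, loop=loop)
try:
    loop.run_until_complete(crawler.crawl())
finally:
    crawler.close()
    loop.close()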
Example #41
class SubscribeListener(SubscribeCallback):
    def __init__(self):
        self.connected = False
        self.connected_event = Event()
        self.disconnected_event = Event()
        self.presence_queue = Queue()
        self.message_queue = Queue()
        self.error_queue = Queue()

    def status(self, pubnub, status):
        if utils.is_subscribed_event(status) and not self.connected_event.is_set():
            self.connected_event.set()
        elif utils.is_unsubscribed_event(status) and not self.disconnected_event.is_set():
            self.disconnected_event.set()
        elif status.is_error():
            self.error_queue.put_nowait(status.error_data.exception)

    def message(self, pubnub, message):
        self.message_queue.put_nowait(message)

    def presence(self, pubnub, presence):
        self.presence_queue.put_nowait(presence)

    @asyncio.coroutine
    def _wait_for(self, coro):
        scc_task = asyncio.ensure_future(coro)
        err_task = asyncio.ensure_future(self.error_queue.get())
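        # Race the awaited coroutine against the next queued error: whichever
        # task finishes first decides whether we return a result or re-raise.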

        yield from asyncio.wait([
            scc_task,
            err_task
        ], return_when=asyncio.FIRST_COMPLETED)

        if err_task.done() and not scc_task.done():
            if not scc_task.cancelled():
                scc_task.cancel()
            raise err_task.result()
        else:
            if not err_task.cancelled():
                err_task.cancel()
            return scc_task.result()

    @asyncio.coroutine
    def wait_for_connect(self):
        if not self.connected_event.is_set():
            yield from self._wait_for(self.connected_event.wait())
        else:
            raise Exception("instance is already connected")

    @asyncio.coroutine
    def wait_for_disconnect(self):
        if not self.disconnected_event.is_set():
            yield from self._wait_for(self.disconnected_event.wait())
        else:
            raise Exception("instance is already disconnected")

    @asyncio.coroutine
    def wait_for_message_on(self, *channel_names):
        channel_names = list(channel_names)
        while True:
            try:
                env = yield from self._wait_for(self.message_queue.get())
                if env.channel in channel_names:
                    return env
                else:
                    continue
            finally:
                self.message_queue.task_done()

    @asyncio.coroutine
    def wait_for_presence_on(self, *channel_names):
        channel_names = list(channel_names)
        while True:
            try:
                env = yield from self._wait_for(self.presence_queue.get())
                if env.channel in channel_names:
                    return env
                else:
                    continue
            finally:
                self.presence_queue.task_done()
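
A hedged usage sketch for the listener above, assuming the SubscribeListener class is importable alongside the PubNub asyncio SDK (PNConfiguration, PubNubAsyncio, and its subscribe builder); the keys, uuid, and channel name are placeholders.

import asyncio

from pubnub.pnconfiguration import PNConfiguration
from pubnub.pubnub_asyncio import PubNubAsyncio


@asyncio.coroutine
def receive_one_message():
    pnconfig = PNConfiguration()
    pnconfig.subscribe_key = 'demo'      # placeholder keys
    pnconfig.publish_key = 'demo'
    pnconfig.uuid = 'example-client'
    pubnub = PubNubAsyncio(pnconfig)

    listener = SubscribeListener()
    pubnub.add_listener(listener)
    pubnub.subscribe().channels('my_channel').execute()

    yield from listener.wait_for_connect()
    envelope = yield from listener.wait_for_message_on('my_channel')
    print(envelope.message)

# e.g. asyncio.get_event_loop().run_until_complete(receive_one_message())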
Exemplo n.º 42
0
class Crawler:
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    """

    def __init__(self, roots,
                 exclude=None, strict=True,  # What to crawl.
                 max_redirect=10, max_tries=4,  # Per-url limits.
                 max_tasks=10, *, loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
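        # Probabilistic 'seen' set: capacity ~10M URLs with a 1% false-positive rate.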
        self.seen_urls = BloomFilter(10000000, 0.01)
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None

    def close(self):
        """Close resources."""
        self.session.close()

    def host_okay(self, host):
        """Check if a host should be crawled.

        A literal match (after lowercasing) is always good.  For hosts
        that don't look like IP addresses, some approximate matches
        are okay depending on the strict flag.
        """
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        """Check if a host should be crawled, strict-ish version.

        This checks for equality modulo an initial 'www.' component.
        """
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        """Check if a host should be crawled, lenient version.

        This compares the last two components of the host.
        """
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    async def parse_links(self, response):
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = await response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = await response.text()

                # Replace href with (?:href|src) to follow image links.
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',
                                      text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r',
                                len(urls), response.url)
                for url in urls:
                    LOGGER.info("response.url:%s,type:%s",
                                response.url, type(response.url))
                    LOGGER.info("parse_links url:%s,type:%s",
                                url, type(url))
                    normalized = urllib.parse.urljoin(str(response.url), url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stat = FetchStatistic(
            url=response.url,
            next_url=None,
            status=response.status,
            exception=None,
            size=len(body),
            content_type=content_type,
            encoding=encoding,
            num_urls=len(links),
            num_new_urls=len(utils.difference(links, self.seen_urls)))

        return stat, links

    async def fetch(self, url, max_redirect):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = await self.session.get(
                    url, allow_redirects=False)

                if tries > 1:
                    LOGGER.info('try %r for %r success', tries, url)

                break
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r',
                            tries, url, client_error)
                exception = client_error

            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            LOGGER.error('%r failed after %r tries',
                         url, self.max_tries)
            self.record_statistic(FetchStatistic(url=url,
                                                 next_url=None,
                                                 status=None,
                                                 exception=exception,
                                                 size=0,
                                                 content_type=None,
                                                 encoding=None,
                                                 num_urls=0,
                                                 num_new_urls=0))
            return

        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                self.record_statistic(FetchStatistic(url=url,
                                                     next_url=next_url,
                                                     status=response.status,
                                                     exception=None,
                                                     size=0,
                                                     content_type=None,
                                                     encoding=None,
                                                     num_urls=0,
                                                     num_new_urls=0))

                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    LOGGER.info('redirect to %r from %r', next_url, url)
                    self.add_url(next_url, max_redirect - 1)
                else:
                    LOGGER.error('redirect limit reached for %r from %r',
                                 next_url, url)
            else:
                stat, links = await self.parse_links(response)
                self.record_statistic(stat)
                for link in utils.difference(links, self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            await response.release()

    async def work(self):
        """Process queue items forever."""
        try:
            while True:
                url, max_redirect = await self.q.get()
                assert url in self.seen_urls
                LOGGER.info("url:%s", url)
                LOGGER.info("max_redirect:%s", max_redirect)
                await self.fetch(url, max_redirect)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            LOGGER.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            LOGGER.debug('skipping non-root host in %r', url)
            return False
        return True

    def add_url(self, url, max_redirect=None):
        """Add a URL to the queue if not seen before."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        LOGGER.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    async def crawl(self):
        """Run the crawler until all finished."""
        workers = [asyncio.Task(self.work(), loop=self.loop)
                   for _ in range(self.max_tasks)]
        self.t0 = time.time()
        await self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
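
A hedged driver sketch for the Crawler above, assuming it lives in the same module as its helpers (FetchStatistic, BloomFilter, utils, LOGGER, is_redirect); the root URL is a placeholder.

import asyncio

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    crawler = Crawler(['http://example.com/'], max_tasks=50, loop=loop)
    try:
        loop.run_until_complete(crawler.crawl())
    finally:
        crawler.close()
        loop.close()
    print('crawled %d URLs in %.1f seconds'
          % (len(crawler.done), crawler.t1 - crawler.t0))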
Exemplo n.º 43
0
class Crawler:
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()

        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        self.session = aiohttp.ClientSession(loop=loop)

        # Put (URL, max_redirect) in the Queue. put_nowait() is used because
        # Queue.put() is a coroutine and would never run if simply called here.
        self.q.put_nowait((root_url, self.max_redirect))

    @asyncio.coroutine
    def crawl(self):
        '''Run the crawler until all work is done.'''
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks)]

        # When all work is done, exit.
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            url, max_redirect = yield from self.q.get()

            # Download page and add new links to self.q
            yield from self.fetch(url, max_redirect)
            self.q.task_done()

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        # Handle redirects ourselves.
        response = yield from self.session.get(
            url, allow_redirects=False)

        try:
            if is_redirect(response):
                if max_redirect > 0:
                    next_url = response.headers['location']
                    if next_url in self.seen_urls:
                        # We have done this before.
                        return

                    # Remember we have seen this url.
                    self.seen_urls.add(next_url)

                    # Follow the redirect. One less redirect remains.
                    self.q.put_nowait((next_url, max_redirect - 1))
            else:
                links = yield from self.parse_links(response)
                # Python set-logic:
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            # Return connection to pool.
            yield from response.release()
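
The examples above rely on an is_redirect() helper and a module-level loop that the excerpt does not show; a minimal sketch consistent with how they are used follows. The status-code list is an assumption, and parse_links is still omitted from this last example, so the driver only illustrates the wiring.

import asyncio


def is_redirect(response):
    # Treat the usual HTTP redirect statuses as redirects.
    return response.status in (300, 301, 302, 303, 307)


loop = asyncio.get_event_loop()  # the Crawler above reads this global in __init__
crawler = Crawler('http://example.com/', max_redirect=10)
loop.run_until_complete(crawler.crawl())
loop.close()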