Example #1
class Crawler:
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()

        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        self.session = aiohttp.ClientSession(loop=loop)

        # Put (URL, max_redirect) in the queue.
        self.q.put_nowait((root_url, self.max_redirect))

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all work is done."""
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks)]

        # When all work is done, exit.
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            url, max_redirect = yield from self.q.get()

            # Download page and add new links to self.q.
            yield from self.fetch(url, max_redirect)
            # Call task_done() only after the new links have been added to q.
            self.q.task_done()
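
This fragment depends on a module-level loop and on fetch()/parse_links() methods defined elsewhere in the same file. A minimal driver for it, offered only as a sketch under those assumptions (the root URL is hypothetical), could look like:

import asyncio
import aiohttp

loop = asyncio.get_event_loop()
crawler = Crawler('http://example.com/', max_redirect=10)
loop.run_until_complete(crawler.crawl())
crawler.session.close()  # close() is a coroutine in newer aiohttp; await it there
loop.close()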
Example #2
async def scrape(client, url):
    tasks = []
    url_queue = Queue()

    archive = {
        'top': url,
        'seen': {},
        'items': []
    }
    await url_queue.put(url)

    def task_completed(future):
        exc = future.exception()
        if exc:
            log.error('Worker finished with error: {} '.format(exc), exc_info=True)

    for _ in range(CONCURRENCY):
        crawler_future = ensure_future(crawler(client, url_queue, archive))
        crawler_future.add_done_callback(task_completed)
        tasks.append(crawler_future)

    await wait_for(url_queue.join(), TIMEOUT)

    for task in tasks:
        task.cancel()
    client.close()

    webarchive = {
        'WebMainResource': archive['items'].pop(0),
        'WebSubresources': archive['items']
    }

    writePlist(webarchive, OUTPUT_FILENAME)
Example #3
class Crawler:
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()

        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        self.session = aiohttp.ClientSession(loop=loop)

        # Put (URL, max_redirect) in the Queue
        self.q.put_nowait((root_url, self.max_redirect))

    @asyncio.coroutine
    def crawl(self):
        '''Run the crawler until all work is done.'''
        workers = [asyncio.Task(self.work()) for _ in range(self.max_tasks)]

        # When all work is done, exit.
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            url, max_redirect = yield from self.q.get()

            # Download page and add new links to self.q
            yield from self.fetch(url, max_redirect)
            self.q.task_done()

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        # Handle redirects ourselves.
        response = yield from self.session.get(url, allow_redirects=False)

        try:
            if is_redirect(response):
                if max_redirect > 0:
                    next_url = response.headers['location']
                    if next_url in self.seen_urls:
                        # We have done this before.
                        return

                    # Remember we have seen this url.
                    self.seen_urls.add(next_url)

                    # Follow the redirect. One less redirect remains.
                    self.q.put_nowait((next_url, max_redirect - 1))
            else:
                links = yield from self.parse_links(response)
                # Python set-logic:
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            # Return connection to pool.
            yield from response.release()
Example #4
class AbstractQueue(AbstractDataset, ABC):
    def __init__(self, maxsize=3, *args):
        super().__init__(*args)

        # initialise queue
        self.queue = Queue(maxsize=maxsize)
        return

    # ********************************************** #

    def __iter__(self):
        # self.background_worker = Thread(target=self.enqueue(), daemon=True)
        # self.background_worker.start()
        return self

    # ********************************************** #

    def __next__(self):
        if not self.queue.empty():
            batch = self.queue.get()
            self.queue.task_done()
            return batch
        else:
            raise StopIteration

    async def request_sample(self, idx):
        start = time.time()
        print("Fetching new sample...", end="\r")
        sample = await self.samples[idx]
        print("Fetched in %4f" % (time.time() - start))
        return sample

    def _producer(self):
        # request_sample is a coroutine; a plain Thread target will not await it.
        for frame in self.frames:
            t = Thread(target=self.request_sample, args=(frame,))
            self.queue.put(t)
        self.queue.join()
        return

    def print_random(self):
        idx = rn.randint(0, len(self) - 1)
        sample = self.request_sample(idx)
        print(f"Example batch {idx:s}")
        print(sample)
Example #5
async def _await_job(queue: asyncio.Queue, event: asyncio.Event,
                     /) -> None:
    job_done = False
    while not (event.is_set() or job_done):
        try:
            await asyncio.wait_for(queue.join(), timeout=1)
        except asyncio.TimeoutError:
            pass
        else:
            job_done = True
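
A hedged usage sketch for _await_job: it re-checks queue.join() once per second so the caller can also react to a shutdown event. The driver below is illustrative only; every name outside _await_job is an assumption.

import asyncio

async def main():
    queue = asyncio.Queue()
    stop = asyncio.Event()
    queue.put_nowait('job')

    waiter = asyncio.create_task(_await_job(queue, stop))

    queue.get_nowait()
    queue.task_done()  # queue.join() can now complete, so the waiter returns
    await waiter

asyncio.run(main())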
Example #6
async def main():
    customer_queue = Queue()

    all_products = [Product('beer', 2),
                    Product('bananas', .5),
                    Product('sausage', .2),
                    Product('diapers', .2)]

    for i in range(10): #C
        products = [all_products[randrange(len(all_products))]
                    for _ in range(randrange(10))]
        customer_queue.put_nowait(Customer(i, products))

    cashiers = [asyncio.create_task(checkout_customer(customer_queue, i))
                for i in range(3)] #D

    await asyncio.gather(customer_queue.join(), *cashiers)
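
checkout_customer is referenced but not shown in this example. Judging from Product('beer', 2) and Customer(i, products), a plausible worker, written purely as an assumption about the missing piece (the attribute names are guesses), would be:

async def checkout_customer(queue, cashier_id):
    while not queue.empty():
        customer = queue.get_nowait()
        for product in customer.products:  # assumed attribute
            print(f'Cashier {cashier_id} checks out {product.name}')  # assumed attribute
            await asyncio.sleep(product.checkout_time)  # assumed attribute
        queue.task_done()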
Example #7
class Crawler:
    def __init__(self, roots, max_task=5):
        self.roots = roots
        self.queue = Queue()
        self.max_task = max_task
        #self.session = aiohttp.ClientSession()
        self.seen_urls = set()
        for root in roots:
            self.add_url(root)

    def add_url(self, root):
        if root in self.seen_urls:
            return
        self.seen_urls.add(root)
        self.queue.put_nowait(root)

    @asyncio.coroutine
    def crawler(self):
        tasks = [asyncio.Task(self.work())
                        for _ in range(self.max_task)]
        #pdb.set_trace()
        yield from self.queue.join()
        for w in tasks:
            w.cancel()  # w is a Task object

    @asyncio.coroutine
    def work(self):
        '''consume coroutine'''
        try:
            while True:
                # When the queue is empty, queue.get() blocks until an item is available.
                url = yield from self.queue.get()
                print('{}'.format(threading.currentThread()))
                yield from self.fetch(url)
                self.queue.task_done()
        except asyncio.CancelledError:
            pass

    @asyncio.coroutine
    def fetch(self,url):
        yield from asyncio.sleep(1)
Example #8
class Crawler:

    def __init__(self, domain, max_redirects=10,
                 max_retries=3, max_tasks=10):
        self.domain = domain
        self.max_redirects = max_redirects
        self.max_tasks = max_tasks
        self.max_retries = max_retries
        # self.loop = loop or asyncio.get_event_loop()
        self.q = Queue()
        self.urls_seen = set()
        self.session = aiohttp.ClientSession()
        self.add_url('/')

    @asyncio.coroutine
    def crawl(self):
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks)]
        yield from self.q.join()
        for worker in workers:
            worker.cancel()

    def close(self):
        self.session.close()

    @asyncio.coroutine
    def work(self):
        try:
            while True:
                url, max_redirects = yield from self.q.get()
                # LOGGER.debug('fetching {}'.format(url))
                yield from self.fetch(url, max_redirects)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    @asyncio.coroutine
    def fetch(self, url, max_redirects):
        retry = 0
        while retry < self.max_retries:
            try:
                response = yield from self.session.get(self.domain+url, allow_redirects=False)
                LOGGER.debug('fetched {}'.format(url))
                break
            except aiohttp.ClientError as client_err:
                retry += 1
                LOGGER.info('fetching {} failed {} times with error {}'.format(url, retry, client_err))
            except Exception as e:
                LOGGER.error('fetching {} with error: {}'.format(url, e))
                return
        else:
            LOGGER.error('fetching {} out of max retry times'.format(url))
            return

        if self.is_redirect(response):
            location = response.headers['location']
            next_url = urllib.parse.urlparse(location).path
            next_url = urllib.parse.urljoin(url, next_url)
            if next_url in self.urls_seen:
                pass
            elif max_redirects > 0:
                self.add_url(next_url, max_redirects-1)
                LOGGER.info('redirect from {} to {}'.format(url, next_url))
            else:
                LOGGER.error('redirect from {} to {} out of times'.format(url, next_url))
        else:
            links = yield from self.parse_links(response)
            LOGGER.debug('parsed {} links from {}'.format(len(links), url))
            for link in links.difference(self.urls_seen):
                self.q.put_nowait((link, self.max_redirects))
            self.urls_seen.update(links)
        yield from response.release()

    def add_url(self, url, max_redirects=None):
        max_redi = max_redirects or self.max_redirects
        self.urls_seen.add(url)
        self.q.put_nowait((url, max_redi))

    def is_redirect(self, response):
        return response.status in (300, 301, 302, 303, 307)

    @asyncio.coroutine
    def parse_links(self, response):
        links = set()
        if response.status == 200:
            content_type = response.headers.get('content-type', '')
            if content_type and content_type.startswith('text/html'):
                text = yield from response.text()
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',
                                      text))
                if urls:
                    LOGGER.info('got {} distinct urls from {}'.format(
                                len(urls), response.url))
                for url in urls:
                    norm_url = urllib.parse.urljoin(response.url, url)
                    url_parts = urllib.parse.urlparse(norm_url)
                    if url_parts.scheme not in ('http', 'https', ''):
                        continue
                    host, port = urllib.parse.splitport(url_parts.netloc)
                    host = host.lower()
                    host = host[4:] if host.startswith('www.') else host
                    if host and host not in self.domain:
                        continue
                    defragmented, frag = urllib.parse.urldefrag(url_parts.path)
                    links.add(defragmented)
        return links
Example #9

def workerTask(q):
	while not q.empty():
		processImage(q.get_nowait()[0])
		q.task_done()
		
if not os.path.exists("__working"):
	os.mkdir("__working")
convertPdfs(pdfList)
q = Queue(maxsize=0)
num_threads = 4

#put files in queue
for fileName in os.listdir("__working"):
	if fileName.endswith(".pbm"):
		q.put_nowait(("__working/" + fileName,))

threads = []
for i in range(num_threads):
	worker = Thread(target=workerTask, args=(q,))
	worker.start()
	threads.append(worker)

q.join()
for thread in threads:
	thread.join()

subprocess.run("rm -r __working", shell=True)

Example #10
class Crawler:
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()

        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        self.session = aiohttp.ClientSession(loop=loop)

        # Put (URL, max_redirect) in the Queue
        self.q.put_nowait((root_url, self.max_redirect))
        
    @asyncio.coroutine
    def crawl(self):
        '''Run the crawler until all work is done.'''
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks)]

        # When all work is done, exit.
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            url, max_redirect = yield from self.q.get()

            # Download page and add new links to self.q
            yield from self.fetch(url, max_redirect)
            self.q.task_done()

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        # Handle redirects ourselves.
        response = yield from self.session.get(
            url, allow_redirects=False)

        try:
            if is_redirect(response):
                if max_redirect > 0:
                    next_url = response.headers['location']
                    if next_url in self.seen_urls:
                        # We have done this before.
                        return

                    # Remember we have seen this url.
                    self.seen_urls.add(next_url)

                    # Follow the redirect. One less redirect remains.
                    self.q.put_nowait((next_url, max_redirect - 1))
            else:
                links = yield from self.parse_links(response)
                # Python set-logic:
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            # Return connection to pool.
            yield from response.release()
Example #11
class Crawler:
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    """

    def __init__(self, roots,
                 exclude=None, strict=True,  # What to crawl.
                 max_redirect=10, max_tries=4,  # Per-url limits.
                 max_tasks=10, *, loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = BloomFilter(10000000, 0.01)
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None

    def close(self):
        """Close resources."""
        self.session.close()

    def host_okay(self, host):
        """Check if a host should be crawled.

        A literal match (after lowercasing) is always good.  For hosts
        that don't look like IP addresses, some approximate matches
        are okay depending on the strict flag.
        """
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        """Check if a host should be crawled, strict-ish version.

        This checks for equality modulo an initial 'www.' component.
        """
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        """Check if a host should be crawled, lenient version.

        This compares the last two components of the host.
        """
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    async def parse_links(self, response):
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = await response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = await response.text()

                # Replace href with (?:href|src) to follow image links.
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',
                                      text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r',
                                len(urls), response.url)
                for url in urls:
                    LOGGER.info("response.url:%s,type:%s",
                                response.url, type(response.url))
                    LOGGER.info("parse_links url:%s,type:%s",
                                url, type(url))
                    normalized = urllib.parse.urljoin(str(response.url), url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stat = FetchStatistic(
            url=response.url,
            next_url=None,
            status=response.status,
            exception=None,
            size=len(body),
            content_type=content_type,
            encoding=encoding,
            num_urls=len(links),
            num_new_urls=len(links) - len(self.seen_urls))

        return stat, links

    async def fetch(self, url, max_redirect):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = await self.session.get(
                    url, allow_redirects=False)

                if tries > 1:
                    LOGGER.info('try %r for %r success', tries, url)

                break
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r',
                            tries, url, client_error)
                exception = client_error

            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            LOGGER.error('%r failed after %r tries',
                         url, self.max_tries)
            self.record_statistic(FetchStatistic(url=url,
                                                 next_url=None,
                                                 status=None,
                                                 exception=exception,
                                                 size=0,
                                                 content_type=None,
                                                 encoding=None,
                                                 num_urls=0,
                                                 num_new_urls=0))
            return

        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                self.record_statistic(FetchStatistic(url=url,
                                                     next_url=next_url,
                                                     status=response.status,
                                                     exception=None,
                                                     size=0,
                                                     content_type=None,
                                                     encoding=None,
                                                     num_urls=0,
                                                     num_new_urls=0))

                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    LOGGER.info('redirect to %r from %r', next_url, url)
                    self.add_url(next_url, max_redirect - 1)
                else:
                    LOGGER.error('redirect limit reached for %r from %r',
                                 next_url, url)
            else:
                stat, links = await self.parse_links(response)
                self.record_statistic(stat)
                for link in utils.difference(links, self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            await response.release()

    async def work(self):
        """Process queue items forever."""
        try:
            while True:
                url, max_redirect = await self.q.get()
                assert url in self.seen_urls
                LOGGER.info("url:%s", url)
                LOGGER.info("max_redirect:%s", max_redirect)
                await self.fetch(url, max_redirect)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            LOGGER.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            LOGGER.debug('skipping non-root host in %r', url)
            return False
        return True

    def add_url(self, url, max_redirect=None):
        """Add a URL to the queue if not seen before."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        LOGGER.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    async def crawl(self):
        """Run the crawler until all finished."""
        workers = [asyncio.Task(self.work(), loop=self.loop)
                   for _ in range(self.max_tasks)]
        self.t0 = time.time()
        await self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
Example #12
class Scraper:
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    """
    def __init__(
            self,
            roots,
            exclude=None,
            strict=True,  # What to crawl.
            max_redirect=10,
            max_tries=4,  # Per-url limits.
            max_tasks=10,
            *,
            loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None

    def close(self):
        """Close resources."""
        self.session.close()

    def host_okay(self, host):
        """Check if a host should be crawled.

        A literal match (after lowercasing) is always good.  For hosts
        that don't look like IP addresses, some approximate matches
        are okay depending on the strict flag.
        """
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        """Check if a host should be crawled, strict-ish version.

        This checks for equality modulo an initial 'www.' component.
        """
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        """Check if a host should be crawled, lenient version.

        This compares the last two components of the host.
        """
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    @asyncio.coroutine
    def parse_links(self, response):
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()

                # Replace href with (?:href|src) to follow image links.
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r', len(urls),
                                response.url)
                for url in urls:
                    normalized = urllib.parse.urljoin(response.url, url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stat = FetchStatistic(url=response.url,
                              next_url=None,
                              status=response.status,
                              exception=None,
                              size=len(body),
                              content_type=content_type,
                              encoding=encoding,
                              num_urls=len(links),
                              num_new_urls=len(links - self.seen_urls))

        return stat, links

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from self.session.get(url,
                                                       allow_redirects=False)

                if tries > 1:
                    LOGGER.info('try %r for %r success', tries, url)

                break
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r', tries, url,
                            client_error)
                exception = client_error

            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            LOGGER.error('%r failed after %r tries', url, self.max_tries)
            self.record_statistic(
                FetchStatistic(url=url,
                               next_url=None,
                               status=None,
                               exception=exception,
                               size=0,
                               content_type=None,
                               encoding=None,
                               num_urls=0,
                               num_new_urls=0))
            return

        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                self.record_statistic(
                    FetchStatistic(url=url,
                                   next_url=next_url,
                                   status=response.status,
                                   exception=None,
                                   size=0,
                                   content_type=None,
                                   encoding=None,
                                   num_urls=0,
                                   num_new_urls=0))

                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    LOGGER.info('redirect to %r from %r', next_url, url)
                    self.add_url(next_url, max_redirect - 1)
                else:
                    LOGGER.error('redirect limit reached for %r from %r',
                                 next_url, url)
            else:
                stat, links = yield from self.parse_links(response)
                self.record_statistic(stat)
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            yield from response.release()

    @asyncio.coroutine
    def work(self):
        """Process queue items forever."""
        try:
            while True:
                url, max_redirect = yield from self.q.get()
                assert url in self.seen_urls
                yield from self.fetch(url, max_redirect)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            LOGGER.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            LOGGER.debug('skipping non-root host in %r', url)
            return False
        return True

    def add_url(self, url, max_redirect=None):
        """Add a URL to the queue if not seen before."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        LOGGER.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    @asyncio.coroutine
    def scrape(self):
        """Run the crawler until all finished."""
        workers = [
            asyncio.Task(self.work(), loop=self.loop)
            for _ in range(self.max_tasks)
        ]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
Example #13
class Crawler:
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    """
    def __init__(self, roots,
                 exclude=None, strict=True,  # What to crawl.
                 max_redirect=10, max_tries=4,  # Per-url limits.
                 max_tasks=10, *, loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None

    def close(self):
        """Close resources."""
        self.session.close()

    def host_okay(self, host):
        """Check if a host should be crawled.

        A literal match (after lowercasing) is always good.  For hosts
        that don't look like IP addresses, some approximate matches
        are okay depending on the strict flag.
        """
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        """Check if a host should be crawled, strict-ish version.

        This checks for equality modulo an initial 'www.' component.
        """
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        """Check if a host should be crawled, lenient version.

        This compares the last two components of the host.
        """
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    @asyncio.coroutine
    def parse_links(self, response):
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()

                # Replace href with (?:href|src) to follow image links.
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r',len(urls), response.url)
                for url in urls:
                    normalized = urllib.parse.urljoin(response.url, url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stat = FetchStatistic(
            url=response.url,
            next_url=None,
            status=response.status,
            exception=None,
            size=len(body),
            content_type=content_type,
            encoding=encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls))

        return stat, links

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from self.session.get(url, allow_redirects=False)  #1
                break  #2
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r', tries, url, client_error)
                exception = client_error
            tries += 1
        else:
            return
            
        try:
            if is_redirect(response):
                location = response.headers['location']

            else:  #4
                stat, links = yield from self.parse_links(response)
                self.record_statistic(stat)
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            yield from response.release()

    @asyncio.coroutine
    def work(self):
        """Process queue items forever."""
        try:
            while True:
                url, max_redirect = yield from self.q.get()  #q.get() Remove and return an item from the queue. If queue is empty, wait until an item is available.
                #print('url',url, 'max_redirect', max_redirect)
                assert url in self.seen_urls  # assert raises immediately if the condition is false
                yield from self.fetch(url, max_redirect)
                self.q.task_done()  # Indicate that a formerly enqueued task is complete.
        except asyncio.CancelledError:
            pass

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            LOGGER.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            LOGGER.debug('skipping non-root host in %r', url)
            return False
        return True

    def add_url(self, url, max_redirect=None):
        """Add a URL to the queue if not seen before."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        LOGGER.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))  # put_nowait() puts an item into the queue without blocking; in practice this runs first.

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all finished."""
        workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)]
        self.t0 = time.time()
        yield from self.q.join()  # Block until all items in the queue have been gotten and processed.
        self.t1 = time.time()
        for w in workers:
            w.cancel()
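
The redirect branch of fetch() is truncated in this excerpt: after reading the location header, nothing is done with it. Going by the fuller versions of the same crawler in Examples #11 and #12, the elided part presumably resolves the location and re-queues it via add_url, roughly:

next_url = urllib.parse.urljoin(url, location)
if next_url not in self.seen_urls and max_redirect > 0:
    self.add_url(next_url, max_redirect - 1)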
Example #14
class Crawler(object):
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    """

    def __init__(
        self,
        roots,
        scraper=None,
        data_handler=None,
        exclude=None,
        strict=True,  # What to crawl.
        max_redirect=5,
        max_tries=10,  # Per-url limits.
        max_tasks=10,
        max_connections_per_host=3,
        *,
        loop=None
    ):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.max_connections_per_host = max_connections_per_host
        self.scraper = scraper
        self.data_handler = data_handler
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r"\A[\d\.]*\Z", host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_urls(root)
        self.t0 = time.time()
        self.t1 = None

    def record_statistic(
        self,
        url=None,
        next_url=None,
        status=None,
        exception=None,
        content_type=None,
        encoding=None,
        num_urls=0,
        num_new_urls=0,
    ):
        """Record the FetchStatistic for completed / failed URL."""
        fetch_statistic = FetchStatistic(
            url=url,
            next_url=next_url,
            status=status,
            size=0,
            exception=exception,
            content_type=content_type,
            encoding=encoding,
            num_urls=num_urls,
            num_new_urls=num_new_urls,
        )
        self.done.append(fetch_statistic)

    def extract_data(self, root_url, html):
        raise NotImplementedError("You need to define a extract_data method!")

    def close(self):
        """Close resources."""
        LOGGER.debug("closing resources")
        self.session.close()

    @asyncio.coroutine
    def parse_links(self, web_page_html, base_url, _content_type, _encoding):
        """Return a list of links."""
        links = set()
        tree = html.fromstring(web_page_html)
        tree.make_links_absolute(base_url)
        urls = [link[2] for link in tree.iterlinks()]
        for url in urls:
            defragmented, frag = urllib.parse.urldefrag(url)
            if verify.url_allowed(
                defragmented, self.root_domains, exclude=self.exclude
            ):  # Select Valid links, testing against regexp and root_domains
                links.add(defragmented)
        if urls:
            LOGGER.info(
                "got %r urls from %r new links: %i visited: %i",
                len(urls),
                base_url,
                len(links - self.seen_urls),
                len(self.seen_urls),
            )
        new_links = [link for link in links.difference(self.seen_urls)]

        self.record_statistic(
            url=base_url,
            content_type=_content_type,
            encoding=_encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls),
        )
        return new_links

    def handle_redirect(self, response, url, max_redirect):
        location = response.headers["location"]
        next_url = urllib.parse.urljoin(url, location)
        self.record_statistic(url=url, next_url=next_url, status=response.status)
        if next_url in self.seen_urls:
            return
        if max_redirect > 0:
            LOGGER.info("redirect to %r from %r max_redir: %i", next_url, url, max_redirect - 1)
            self.add_urls(next_url, max_redirect - 1)
        else:
            LOGGER.error("redirect limit reached for %r from %r", next_url, url)
        return

    @asyncio.coroutine
    def fetch(self, url, max_redirect, sem):
        """Fetch one URL."""
        tries = 0
        web_page = None
        exception = None
        _url = None
        _encoding = None
        _content_type = None
        sleep_time = 0
        while tries < self.max_tries:
            try:
                with (yield from sem):
                    response = yield from asyncio.wait_for(
                        self.session.get(url, allow_redirects=False), 10, loop=self.loop
                    )
                if tries > 1:
                    LOGGER.debug("try %r for %r success", tries, url)
                break
            except Exception as client_error:
                sleep_time += 5
                yield from asyncio.sleep(sleep_time)
                LOGGER.error("try %r for %r raised %r", tries, url, client_error)
                exception = client_error
            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            LOGGER.error("%r failed after %r tries", url, self.max_tries)
            self.record_statistic(url=url, exception=exception)
            return (web_page, _url, _content_type, _encoding)
        try:
            _url, _content_type, _encoding = get_content_type_and_encoding(response)
            if is_redirect(response):
                self.handle_redirect(response, url, max_redirect)
                web_page = "redirect"
            elif response.status == 200 and _content_type in ("text/html", "application/xml"):
                web_page = yield from response.text()
            else:
                self.record_statistic(
                    url=response.url, status=response.status, content_type=_content_type, encoding=_encoding
                )
        except Exception as e:
            LOGGER.error("error processing %r: %r", url, e)
        finally:
            yield from response.release()
        return (web_page, _url, _content_type, _encoding)

    def add_urls(self, urls, max_redirect=None):
        """Add a URL to the queue if not seen before."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        if not isinstance(urls, str):
            urls = set(urls)
            for link in urls.difference(self.seen_urls):
                self.q.put_nowait((link, max_redirect))
            self.seen_urls.update(urls)
        elif urls not in self.seen_urls:
            self.q.put_nowait((urls, max_redirect))
            self.seen_urls.add(urls)

    @asyncio.coroutine
    def work(self, sem):
        """Process queue items forever."""
        try:
            while True:
                url, max_redirect = yield from self.q.get()
                # assert url in self.seen_urls
                web_page, url, content_type, encoding = yield from self.fetch(url, max_redirect, sem)
                if web_page and web_page != "redirect":
                    new_links = yield from self.parse_links(web_page, url, content_type, encoding)
                    if self.scraper:
                        data = self.scraper.scrape(url, web_page)
                        if self.data_handler:
                            self.data_handler.handle(data)
                    self.add_urls(new_links)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all finished."""
        sem = asyncio.Semaphore(value=self.max_connections_per_host, loop=self.loop)
        LOGGER.info("Starting crawl...")
        workers = [asyncio.Task(self.work(sem), loop=self.loop) for _ in range(self.max_tasks)]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
Example #15
class Crawl:
    def __init__(self, url, test_url, *, number=10, max_tasks=5):
        self.url = url
        self.test_url = test_url
        self.number = number
        self.max_tasks = max_tasks
        self.url_queue = Queue()
        self.raw_proxy_queue = Queue()
        self.session = aiohttp.ClientSession()  # tips: connection pool

    async def fetch_page(self, url):
        async with aiohttp.get(url) as response:
            try:
                assert response.status == 200
                print("OK!", response.url)
                return await response.text()
            except AssertionError:
                print('Error!', response.url, response.status)

    async def filter_page(self, url):
        page = await self.fetch_page(url)
        if page:
            pattern = re.compile(
                r'<tr>.*?<td>(.*?)</td>.*?<td>(.*?)</td>.*?<td>.*?</td>.*?<td>(.*?)</td>.*?<td>(.*?)</td>.*?<td>(.*?)</td>.*?<td>(.*?)</td>.*?</tr>',
                re.S)
            data = pattern.findall(page)
            print(len(data))
            for raw in data:
                item = list(map(lambda word: word.lower(), raw))
                await self.raw_proxy_queue.put({
                    'ip': item[0],
                    'port': item[1],
                    'anonymous': item[2],
                    'protocol': item[3],
                    'speed': item[4],
                    'checking-time': item[5]
                })
            if not self.raw_proxy_queue.empty():
                print('OK! raw_proxy_queue size: ',
                      self.raw_proxy_queue.qsize())

    async def verify_proxy(self, proxy):
        addr = proxy['protocol'] + '://' + proxy['ip'] + ':' + proxy['port']
        conn = aiohttp.ProxyConnector(proxy=addr)
        try:
            session = aiohttp.ClientSession(connector=conn)
            with aiohttp.Timeout(10):
                start = time.time()
                async with session.get(
                        self.test_url
                ) as response:  # close connection and response, otherwise will tip: Unclosed connection and Unclosed response
                    end = time.time()
                    try:
                        assert response.status == 200
                        print('Good proxy: {} {}s'.format(
                            proxy['ip'], end - start))
                    except:  # ProxyConnectionError, HttpProxyError and etc?
                        print('Bad proxy: {}, {}, {}s'.format(
                            proxy['ip'], response.status, end - start))
        except:
            print('timeout {}, q size: {}'.format(
                proxy['speed'], self.raw_proxy_queue.qsize()))
        finally:  # close session when timeout
            session.close()

    async def fetch_worker(self):
        while True:
            url = await self.url_queue.get()
            try:
                await self.filter_page(url)
            finally:
                self.url_queue.task_done()

    async def verify_worker(self):
        while True:
            raw_proxy = await self.raw_proxy_queue.get()
            if raw_proxy['protocol'] == 'https':  # only http can be used
                self.raw_proxy_queue.task_done()
                continue
            try:
                await self.verify_proxy(raw_proxy)
            finally:
                try:
                    self.raw_proxy_queue.task_done()
                except:
                    pass

    async def run(self):
        await asyncio.wait([
            self.url_queue.put(self.url + repr(i + 1))
            for i in range(self.number)
        ])
        fetch_tasks = [
            asyncio.ensure_future(self.fetch_worker())
            for _ in range(self.max_tasks)
        ]
        verify_tasks = [
            asyncio.ensure_future(self.verify_worker())
            for _ in range(10 * self.max_tasks)
        ]
        tasks = fetch_tasks + verify_tasks
        await self.url_queue.join()
        self.session.close()  # close session, otherwise shows error
        print("url_queue done")
        await self.raw_proxy_queue.join()
        print("raw_proxy_queue done")
        for task in tasks:
            task.cancel()
Example #16
class Crawler(object):
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    """
    def __init__(
            self,
            roots,
            scraper=None,
            data_handler=None,
            exclude=None,
            strict=True,  # What to crawl.
            max_redirect=5,
            max_tries=10,  # Per-url limits.
            max_tasks=10,
            max_connections_per_host=3,
            *,
            loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.max_connections_per_host = max_connections_per_host
        self.scraper = scraper
        self.data_handler = data_handler
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_urls(root)
        self.t0 = time.time()
        self.t1 = None

    def record_statistic(self,
                         url=None,
                         next_url=None,
                         status=None,
                         exception=None,
                         content_type=None,
                         encoding=None,
                         num_urls=0,
                         num_new_urls=0):
        """Record the FetchStatistic for completed / failed URL."""
        fetch_statistic = FetchStatistic(url=url,
                                         next_url=next_url,
                                         status=status,
                                         size=0,
                                         exception=exception,
                                         content_type=content_type,
                                         encoding=encoding,
                                         num_urls=num_urls,
                                         num_new_urls=num_new_urls)
        self.done.append(fetch_statistic)

    def extract_data(self, root_url, html):
        raise NotImplementedError('You need to define an extract_data method!')

    def close(self):
        """Close resources."""
        LOGGER.debug("closing resources")
        self.session.close()

    @asyncio.coroutine
    def parse_links(self, web_page_html, base_url, _content_type, _encoding):
        """Return a list of links."""
        links = set()
        tree = html.fromstring(web_page_html)
        tree.make_links_absolute(base_url)
        urls = [link[2] for link in tree.iterlinks()]
        for url in urls:
            defragmented, frag = urllib.parse.urldefrag(url)
            if verify.url_allowed(
                    defragmented, self.root_domains, exclude=self.exclude
            ):  # Select Valid links, testing against regexp and root_domains
                links.add(defragmented)
        if urls:
            LOGGER.info('got %r urls from %r new links: %i visited: %i',
                        len(urls), base_url, len(links - self.seen_urls),
                        len(self.seen_urls))
        new_links = [link for link in links.difference(self.seen_urls)]

        self.record_statistic(url=base_url,
                              content_type=_content_type,
                              encoding=_encoding,
                              num_urls=len(links),
                              num_new_urls=len(links - self.seen_urls))
        return new_links

    def handle_redirect(self, response, url, max_redirect):
        location = response.headers['location']
        next_url = urllib.parse.urljoin(url, location)
        self.record_statistic(url=url,
                              next_url=next_url,
                              status=response.status)
        if next_url in self.seen_urls:
            return
        if max_redirect > 0:
            LOGGER.info('redirect to %r from %r max_redir: %i', next_url, url,
                        max_redirect - 1)
            self.add_urls(next_url, max_redirect - 1)
        else:
            LOGGER.error('redirect limit reached for %r from %r', next_url,
                         url)
        return

    @asyncio.coroutine
    def fetch(self, url, max_redirect, sem):
        """Fetch one URL."""
        tries = 0
        web_page = None
        exception = None
        _url = None
        _encoding = None
        _content_type = None
        sleep_time = 0
        while tries < self.max_tries:
            try:
                with (yield from sem):
                    response = yield from asyncio.wait_for(self.session.get(
                        url, allow_redirects=False),
                                                           10,
                                                           loop=self.loop)
                if tries > 1:
                    LOGGER.debug('try %r for %r success', tries, url)
                break
            except Exception as client_error:
                sleep_time += 5
                yield from asyncio.sleep(sleep_time)
                LOGGER.error('try %r for %r raised %r', tries, url,
                             client_error)
                exception = client_error
            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            LOGGER.error('%r failed after %r tries', url, self.max_tries)
            self.record_statistic(url=url, exception=exception)
            return (web_page, _url, _content_type, _encoding)
        try:
            _url, _content_type, _encoding = get_content_type_and_encoding(
                response)
            if is_redirect(response):
                self.handle_redirect(response, url, max_redirect)
                web_page = 'redirect'
            elif response.status == 200 and _content_type in (
                    'text/html', 'application/xml'):
                web_page = yield from response.text()
            else:
                self.record_statistic(url=response.url,
                                      status=response.status,
                                      content_type=_content_type,
                                      encoding=_encoding)
        except Exception as e:
            LOGGER.error('error processing %r: %r', url, e)
        finally:
            yield from response.release()
        return (web_page, _url, _content_type, _encoding)

    def add_urls(self, urls, max_redirect=None):
        """Add a URL to the queue if not seen before."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        if not isinstance(urls, str):
            urls = set(urls)
            for link in urls.difference(self.seen_urls):
                self.q.put_nowait((link, max_redirect))
            self.seen_urls.update(urls)
        elif urls not in self.seen_urls:
            self.q.put_nowait((urls, max_redirect))
            self.seen_urls.add(urls)

    @asyncio.coroutine
    def work(self, sem):
        """Process queue items forever."""
        try:
            while True:
                url, max_redirect = yield from self.q.get()
                #assert url in self.seen_urls
                web_page, url, content_type, encoding = yield from self.fetch(
                    url, max_redirect, sem)
                if web_page and web_page != 'redirect':
                    new_links = yield from self.parse_links(
                        web_page, url, content_type, encoding)
                    if self.scraper:
                        data = self.scraper.scrape(url, web_page)
                        if self.data_handler:
                            self.data_handler.handle(data)
                    self.add_urls(new_links)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all finished."""
        sem = asyncio.Semaphore(value=self.max_connections_per_host,
                                loop=self.loop)
        LOGGER.info('Starting crawl...')
        workers = [
            asyncio.Task(self.work(sem), loop=self.loop)
            for _ in range(self.max_tasks)
        ]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
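
The scraper and data_handler collaborators are only invoked in work(), never defined in this listing. Minimal stand-ins that satisfy the interface used there (purely illustrative, not part of the original project) could look like:

class PrintScraper:
    def scrape(self, url, web_page):
        # return whatever structure your handler understands
        return {'url': url, 'size': len(web_page)}

class PrintHandler:
    def handle(self, data):
        print(data)

crawler = Crawler(['http://example.com/'], scraper=PrintScraper(), data_handler=PrintHandler())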