Example #1
    def parse(self, response):
        self._logger.debug("crawled url {}".format(response.request.url))
        cur_depth = 0
        if 'curdepth' in response.meta:
            cur_depth = response.meta['curdepth']

        # capture raw response
        item = RawResponseItem()
        # populated from response.meta
        item['appid'] = response.meta['appid']
        item['crawlid'] = response.meta['crawlid']
        item['attrs'] = response.meta['attrs']

        # populated from raw HTTP response
        item["url"] = response.request.url
        item["response_url"] = response.url
        item["status_code"] = response.status
        item["status_msg"] = "OK"
        item["response_headers"] = self.reconstruct_headers(response)
        item["request_headers"] = response.request.headers
        # item["body"] = response.body
        item["body"] = "asdfsdfsdfsdfsdfsdf"
        item["links"] = []
        hxs = Selector(response)
        cityList = hxs.xpath('//div[@class="header-city-province-text"]/a')
        for city in cityList:
            link = city.xpath('@href').extract()[0]
            cityName = city.xpath("text()").extract()[0]
            yield Request(url=link,
                          meta={"city": cityName},
                          callback=self.parse_url,
                          dont_filter=True)
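
The RawResponseItem class populated throughout this listing is not shown anywhere in the examples. A minimal sketch that would be consistent with the fields the examples assign is given below; this is an assumption for illustration, not the project's actual class definition.

import scrapy

class RawResponseItem(scrapy.Item):
    # identifiers copied from response.meta
    appid = scrapy.Field()
    crawlid = scrapy.Field()
    attrs = scrapy.Field()
    # raw HTTP response data
    url = scrapy.Field()
    response_url = scrapy.Field()
    status_code = scrapy.Field()
    status_msg = scrapy.Field()
    response_headers = scrapy.Field()
    request_headers = scrapy.Field()
    body = scrapy.Field()
    links = scrapy.Field()
    encoding = scrapy.Field()
    # fields that only some of the examples below populate
    headers = scrapy.Field()
    curdepth = scrapy.Field()
    is_pdf = scrapy.Field()
    image = scrapy.Field()
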
Example #2
    def parse(self, response):
        self._logger.debug("crawled url {}".format(response.request.url))
        cur_depth = 0
        if 'curdepth' in response.meta:
            cur_depth = response.meta['curdepth']

        # capture raw response
        item = RawResponseItem()
        # populated from response.meta
        item['appid'] = response.meta['appid']
        item['crawlid'] = response.meta['crawlid']
        item['attrs'] = response.meta['attrs']

        # populated from raw HTTP response
        item["url"] = response.request.url
        item["response_url"] = response.url
        item["status_code"] = response.status
        item["status_msg"] = "OK"
        item["response_headers"] = self.reconstruct_headers(response)
        item["request_headers"] = response.request.headers
        item["body"] = response.body
        item["links"] = []

        # determine whether to continue spidering
        if cur_depth >= response.meta['maxdepth']:
            self._logger.debug("Not spidering links in '{}' because" \
                " cur_depth={} >= maxdepth={}".format(
                                                      response.url,
                                                      cur_depth,
                                                      response.meta['maxdepth']))
        else:
            # we are spidering -- yield Request for each discovered link
            link_extractor = LinkExtractor(
                            allow_domains=response.meta['allowed_domains'],
                            allow=response.meta['allow_regex'],
                            deny=response.meta['deny_regex'],
                            deny_extensions=response.meta['deny_extensions'])

            for link in link_extractor.extract_links(response):
                # link that was discovered
                the_url = link.url
                the_url = the_url.replace('\n', '')
                item["links"].append({"url": the_url, "text": link.text, })
                req = Request(the_url, callback=self.parse)

                req.meta['priority'] = response.meta['priority'] - 10
                req.meta['curdepth'] = response.meta['curdepth'] + 1

                if 'useragent' in response.meta and \
                        response.meta['useragent'] is not None:
                    req.headers['User-Agent'] = response.meta['useragent']

                self._logger.debug("Trying to follow link '{}'".format(req.url))
                yield req

        # raw response has been processed, yield to item pipeline
        yield item
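
The parse callback in Example #2 assumes that every incoming request already carries a set of keys in response.meta. A hypothetical seed request consistent with the keys read above might look like the sketch below; the key names come from the example, the values are placeholders, and Request is assumed to be scrapy's Request class.

    def start_requests(self):
        # placeholder seed request; values are illustrative only
        yield Request(
            "http://example.com",
            callback=self.parse,
            meta={
                "appid": "testapp",
                "crawlid": "abc123",
                "attrs": {},
                "curdepth": 0,
                "maxdepth": 2,
                "priority": 100,
                "allowed_domains": [],   # forwarded to LinkExtractor
                "allow_regex": [],
                "deny_regex": [],
                "deny_extensions": None,
                "useragent": None,       # optional User-Agent override
            })
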
Example #3
    def _get_item(self):
        item = RawResponseItem()
        item['appid'] = 'app'
        item['crawlid'] = 'crawlid'
        item['attrs'] = {}
        item["url"] = "http://dumb.com"
        item["response_url"] = "http://dumb.com"
        item["status_code"] = 200
        item["status_msg"] = "OK"
        item["response_headers"] = {}
        item["request_headers"] = {}
        item["body"] = "text"
        item["links"] = []

        return item
Example #4
    def _get_internationalized_utf8_item(self):
        item = RawResponseItem()
        item['appid'] = 'app'
        item['crawlid'] = 'crawlid'
        item['attrs'] = {}
        item["url"] = "http://dumb.com"
        item["response_url"] = "http://dumb.com"
        item["status_code"] = 200
        item["status_msg"] = "OK"
        item["response_headers"] = {}
        item["request_headers"] = {}
        item["body"] = u"This is a test - Αυτό είναι ένα τεστ - 这是一个测试 - これはテストです"
        item["links"] = []
        item["encoding"] = "utf-8"

        return item
Example #5
    def _get_internationalized_iso_item(self):
        item = RawResponseItem()
        item['appid'] = 'app'
        item['crawlid'] = 'crawlid'
        item['attrs'] = {}
        item["url"] = "http://dumb.com"
        item["response_url"] = "http://dumb.com"
        item["status_code"] = 200
        item["status_msg"] = "OK"
        item["response_headers"] = {}
        item["request_headers"] = {}
        # Fill the item["body"] with the string 'αυτό είναι ένα τεστ' that was encoded in iso-8859-7
        # using iconv and further encoded in base64 in order to store it inside this file.
        item["body"] = base64.b64decode('4fX0/CDl3+3h6SDd7eEg9OXz9Ao=')
        item["links"] = []
        item["encoding"] = "iso-8859-7"

        return item
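
As a quick sanity check on the fixture above (not part of the original helper), the base64 payload decodes back to the Greek test string under the declared encoding; iconv left a trailing newline in the encoded bytes:

    decoded = base64.b64decode('4fX0/CDl3+3h6SDd7eEg9OXz9Ao=').decode('iso-8859-7')
    assert decoded.strip() == u'αυτό είναι ένα τεστ'
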
Example #6
    def parse(self, response):
        self._logger.debug("crawled url {}".format(response.request.url))
        # capture raw response
        item = RawResponseItem()
        # populated from response.meta
        item['appid'] = response.meta['appid']
        item['crawlid'] = response.meta['crawlid']
        item['attrs'] = response.meta['attrs']

        # populated from raw HTTP response
        item["url"] = response.request.url
        item["response_url"] = response.url
        item["status_code"] = response.status
        item["response_headers"] = self.reconstruct_headers(response)
        item["request_headers"] = response.request.headers

        # raw response has been processed, yield to item pipeline
        self._logger.debug("Created Item successfully")
        yield item
Example #7
    def parse(self, response):
        self._logger.debug("crawled url {}".format(response.request.url))
        cur_depth = 0
        if 'curdepth' in response.meta:
            cur_depth = response.meta['curdepth']

        # capture raw response
        item = RawResponseItem()
        # populated from response.meta
        item['appid'] = response.meta['appid']
        item['crawlid'] = response.meta['crawlid']
        item['attrs'] = response.meta['attrs']

        # populated from raw HTTP response
        item["url"] = response.request.url
        item["response_url"] = response.url
        item["status_code"] = response.status
        item["status_msg"] = "OK"
        item["response_headers"] = self.reconstruct_headers(response)
        item["request_headers"] = response.request.headers
        item["body"] = response.body
        # item["body"] = "asdfsdfsdfsdfsdfsdf"
        item["links"] = []

        soup = BeautifulSoup(response.body.decode("utf-8"), "lxml")
        cities = soup.find("div", class_="area-city-letter").find_all("a")
        for city in cities:
            cityspell = city['href']
            cityname = city.get_text(strip=True)
            link = self.root_url + cityspell + "ershouche/"
            req = Request(link,
                          meta={"city": cityname},
                          callback=self.parse_list,
                          dont_filter=True)
            if 'useragent' in response.meta and response.meta[
                    'useragent'] is not None:
                req.headers['User-Agent'] = response.meta['useragent']
            yield req
            self.logger.info(req)
        yield item
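
For comparison, the city extraction in Example #7 could also be written with Scrapy's own Selector in the style of Example #1. The fragment below is a hypothetical, untested sketch; the BeautifulSoup version above is the actual code.

        # hypothetical Selector-based variant of the extraction above
        hxs = Selector(response)
        cityList = hxs.xpath('//div[@class="area-city-letter"]//a')
        for city in cityList:
            cityspell = city.xpath('@href').extract()[0]
            cityname = city.xpath('text()').extract()[0].strip()
            link = self.root_url + cityspell + "ershouche/"
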
Example #8
    def parse(self, response):
        # debug output for receiving the url
        self._logger.debug("crawled url {}".format(response.request.url))

        # step counter for how many pages we have hit
        step = 0
        if 'step' in response.meta:
            step = response.meta['step']

        # Create Item to send to kafka
        # capture raw response
        item = RawResponseItem()
        # populated from response.meta
        item['appid'] = response.meta['appid']
        item['crawlid'] = response.meta['crawlid']
        item['attrs'] = response.meta['attrs']
        # populated from raw HTTP response
        item["url"] = response.request.url
        item["response_url"] = response.url
        item["status_code"] = response.status
        item["status_msg"] = "OK"
        item["response_headers"] = self.reconstruct_headers(response)
        item["request_headers"] = response.request.headers
        item["body"] = response.body
        item["links"] = []
        # we want to know how far our spider gets
        if item['attrs'] is None:
            item['attrs'] = {}

        item['attrs']['step'] = step

        self._logger.debug("Finished creating item")

        # determine what link we want to crawl
        link_extractor = LinkExtractor(
            allow_domains=response.meta['allowed_domains'],
            allow=response.meta['allow_regex'],
            deny=response.meta['deny_regex'],
            deny_extensions=response.meta['deny_extensions'])

        links = link_extractor.extract_links(response)

        # there are links on the page
        if len(links) > 0:
            self._logger.debug("Attempting to find links")
            link = random.choice(links)
            req = Request(link.url, callback=self.parse)

            # increment our step counter for this crawl job
            req.meta['step'] = step + 1

            # pass along our user agent as well
            if 'useragent' in response.meta and \
                        response.meta['useragent'] is not None:
                req.headers['User-Agent'] = response.meta['useragent']

            # debug output
            self._logger.debug("Trying to yield link '{}'".format(req.url))

            # yield the Request to the scheduler
            yield req
        else:
            self._logger.info("Did not find any more links")

        # raw response has been processed, yield to item pipeline
        yield item
Example #9
    def parse(self, response):
        self._logger.debug("crawled url {}".format(response.request.url))
        self._increment_status_code_stat(response)

        cur_depth = 0
        if 'curdepth' in response.meta:
            cur_depth = response.meta['curdepth']

        # capture raw response
        item = RawResponseItem()
        # populated from response.meta
        item['appid'] = response.meta['appid']
        item['crawlid'] = response.meta['crawlid']
        item['attrs'] = response.meta['attrs']

        # populated from raw HTTP response
        item["url"] = response.request.url
        item["response_url"] = response.url
        item["status_code"] = response.status
        item["status_msg"] = "OK"
        item["response_headers"] = self.reconstruct_headers(response)
        item["request_headers"] = response.request.headers
        item["body"] = response.body
        item["links"] = []
        if isinstance(response, (SplashResponse, SplashTextResponse)):
            if "png" in response.data:
                print " @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ "
                print " @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ "
                print " @@@@@@@@@@@@@@@@@@@@ image @@@@@@@@@@@@@@@@@@@@@@ "
                print " @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ "
                print " @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ "
                item["image"] = response.data['png']

        # login
        if response.url in response.meta['login'] and response.status == 200:
            _id = response.meta['login'][response.url]['loginid']
            _pass = response.meta['login'][response.url]['password']

            data, url, method = fill_login_form(response.url, response.body,
                                                _id, _pass)
            yield FormRequest(url,
                              formdata=dict(data),
                              method=method,
                              callback=self.parse,
                              meta=make_splash_meta(response.meta))
        else:
            # determine whether to continue spidering
            if response.meta['maxdepth'] != -1 and cur_depth >= response.meta[
                    'maxdepth']:
                self._logger.debug("Not spidering links in '{}' because" \
                    " cur_depth={} >= maxdepth={}".format(
                                                          response.url,
                                                          cur_depth,
                                                          response.meta['maxdepth']))
            else:
                # we are spidering -- yield Request for each discovered link
                link_extractor = LinkExtractor(
                    deny_domains=response.meta['denied_domains'],
                    allow_domains=response.meta['allowed_domains'],
                    allow=response.meta['allow_regex'],
                    deny=response.meta['deny_regex'],
                    deny_extensions=response.meta['deny_extensions'])

                for link in link_extractor.extract_links(response):
                    # link that was discovered
                    item["links"].append({
                        "url": link.url,
                        "text": link.text,
                    })
                    req = Request(link.url,
                                  callback=self.parse,
                                  meta=make_splash_meta({}))

                    # pass along all known meta fields
                    for key in response.meta.keys():
                        if key != 'splash' and key != 'request':
                            req.meta[key] = response.meta[key]
                    if '_splash_processed' in req.meta:
                        req.meta.pop("_splash_processed")

                    req.meta['priority'] = response.meta['priority'] - 10
                    req.meta['curdepth'] = response.meta['curdepth'] + 1

                    if 'useragent' in response.meta and \
                            response.meta['useragent'] is not None:
                        req.headers['User-Agent'] = response.meta['useragent']

                    self._logger.debug("Trying to follow link '{}'".format(
                        req.url))
                    yield req

        # raw response has been processed, yield to item pipeline
        yield item
Example #10
    def parse(self, response):
        # Check url at start of parse to catch links that were potentially redirected.
        orig_domain = response.url
        if "orig_domain" in response.meta:
            orig_domain = response.meta["orig_domain"]
        else:
            response.meta["orig_domain"] = orig_domain
        if not self.validate_link(response.url, orig_domain):
            return

        self._logger.debug("starting parse on url {}".format(
            response.request.url))
        cur_depth = 0
        if 'curdepth' in response.meta:
            cur_depth = response.meta['curdepth']
        else:
            response.meta['curdepth'] = cur_depth
        self._logger.debug("Forming response object")
        # capture raw response
        item = RawResponseItem()
        # populated from response.meta
        item['appid'] = response.meta['appid']
        item['crawlid'] = response.meta['crawlid']
        item['attrs'] = response.meta['attrs']

        # populated from raw HTTP response
        item["url"] = response.request.url
        item["response_url"] = response.url
        item["status_code"] = response.status
        item["status_msg"] = "OK"
        item["response_headers"] = self.reconstruct_headers(response)
        item["request_headers"] = response.request.headers
        item["links"] = []
        item["curdepth"] = str(cur_depth)

        is_pdf = False
        url = response.url.lower()
        if (url[len(url) - 4:] == '.pdf') or ('.pdf?' in url):
            is_pdf = True

        item["is_pdf"] = str(is_pdf)
        if is_pdf:
            self._logger.debug("Handling pdf file")
            self.download_file(response.url)
            item["body"] = self.pdfparser("temp_document.pdf")
        else:
            item["body"] = self.gather_text(response.body)
            self._logger.debug("Current depth: " + str(cur_depth))
            # determine whether to continue spidering
            if cur_depth >= response.meta['maxdepth']:
                self._logger.debug("Not spidering links in '{}' because" \
                    " cur_depth={} >= maxdepth={}".format(
                                                          response.url,
                                                          cur_depth,
                                                          response.meta['maxdepth']))
            else:
                # we are spidering -- yield Request for each discovered link
                link_extractor = LinkExtractor(
                    allow_domains=response.meta['allowed_domains'],
                    allow=response.meta['allow_regex'],
                    deny=response.meta['deny_regex'],
                    deny_extensions=response.meta['deny_extensions'])

                for link in link_extractor.extract_links(response):
                    # link that was discovered
                    the_url = link.url
                    the_url = the_url.replace('\n', '')
                    if not self.validate_link(the_url, orig_domain):
                        continue
                    item["links"].append(
                        str({
                            "url": the_url,
                            "text": link.text,
                        }))
                    req = Request(the_url, callback=self.parse)

                    req.meta['priority'] = response.meta['priority'] - 10
                    req.meta['curdepth'] = response.meta['curdepth'] + 1

                    if 'useragent' in response.meta and \
                            response.meta['useragent'] is not None:
                        req.headers['User-Agent'] = response.meta['useragent']

                    self._logger.debug("Trying to follow link '{}'".format(
                        req.url))
                    yield req

        # raw response has been processed, yield to item pipeline
        yield item
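
The PDF detection in Example #10 slices the lowercased URL string. A hypothetical helper with the same intent, based on the URL path so that query strings are handled uniformly, could look like this:

from urllib.parse import urlparse

def _looks_like_pdf(url):
    # equivalent in spirit to: url[-4:] == '.pdf' or '.pdf?' in url
    return urlparse(url.lower()).path.endswith(".pdf")
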
Example #11
    def parse(self, response):
        print "crawled url", response.request.url
        cur_depth = 0
        if 'curdepth' in response.meta:
            cur_depth = response.meta['curdepth']

        # capture raw response
        item = RawResponseItem()

        # populated from response.meta
        item['appid'] = response.meta['appid']
        item['crawlid'] = response.meta['crawlid']
        item['attrs'] = response.meta['attrs']

        # populated from raw HTTP response
        item["url"] = response.request.url
        item["response_url"] = response.url
        item["status_code"] = response.status
        item["status_msg"] = "OK"

        item["headers"] = self.reconstruct_headers(response)
        item["body"] = response.body
        item["links"] = []

        # determine whether to continue spidering
        if cur_depth >= response.meta['maxdepth']:
            self.log("Not spidering links in '{}' because" \
                " cur_depth={} >= maxdepth={}".format(
                response.url,
                cur_depth,
                response.meta['maxdepth']), level=INFO)
        else:
            # we are spidering -- yield Request for each discovered link
            link_extractor = LinkExtractor(
                allow_domains=response.meta['allowed_domains'],
                allow=response.meta['allow_regex'],
                deny=response.meta['deny_regex'],
                deny_extensions=response.meta['deny_extensions'])
            for link in link_extractor.extract_links(response):
                # link that was discovered
                item["links"].append({
                    "url": link.url,
                    "text": link.text,
                })

                req = Request(
                    link.url,
                    callback=self.parse,
                    meta={
                        "allowed_domains": response.meta['allowed_domains'],
                        "allow_regex": response.meta['allow_regex'],
                        "deny_regex": response.meta['deny_regex'],
                        "deny_extensions": response.meta['deny_extensions'],
                        "maxdepth": response.meta['maxdepth'],
                        "curdepth": cur_depth + 1,
                        "appid": response.meta['appid'],
                        "crawlid": response.meta['crawlid'],
                        "attrs": response.meta['attrs'],
                        "spiderid": self.name,
                        "expires": response.meta['expires'],
                        "priority": response.meta['priority'] - 10,
                    },
                )

                self.log("Trying to follow link '{}'".format(req.url),
                         level=INFO)
                yield req

        # raw response has been processed, yield to item pipeline
        yield item
Example #12
    def parse(self, response):
        self._logger.debug("crawling url {}".format(response.request.url))
        is_pdf = False
        url = response.url.lower()
        if (url[len(url) - 4:] == '.pdf') or ('.pdf?' in url):
            self._logger.debug("Found a pdf file, not making a Link object")
            is_pdf = True
        else:
            if "link" not in response.meta:
                link = Link(response.url)
            else:
                link = Link(response.meta["link"])

            self._logger.debug("made the link object")
            self._logger.debug("Link created is of type " + link.type)

        cur_depth = 0
        if 'curdepth' in response.meta:
            cur_depth = response.meta['curdepth']

        # capture raw response
        item = RawResponseItem()
        # populated from response.meta
        item['appid'] = response.meta['appid']
        item['crawlid'] = response.meta['crawlid']
        item['attrs'] = response.meta['attrs']

        # populated from raw HTTP response
        item["url"] = response.request.url
        item["response_url"] = response.url
        item["status_code"] = response.status
        item["status_msg"] = "OK"
        item["response_headers"] = self.reconstruct_headers(response)
        item["request_headers"] = response.request.headers
        item["body"] = ""
        item["links"] = []

        item["is_pdf"] = str(is_pdf)

        # if response.meta["maxdepth"] == 0:
        # response.meta["maxdepth"] = maxint

        # if "maxdepth" in response.meta and cur_depth >= response.meta['maxdepth']:
        #     self._logger.debug("Not spidering links in '{}' because" \
        #                        " cur_depth={} >= maxdepth={}".format(
        #         response.url,
        #         cur_depth,
        #         response.meta['maxdepth']))
        # else:
        try:
            # self._logger.debug("Current max depth is " + str(response.meta['maxdepth']))
            if is_pdf:
                self._logger.debug("Downloading pdf file")
                self.download_file(response.url)

                self._logger.debug("Parsing pdf file")
                item["body"] = self.pdfparser("temp_document.pdf")
                self._logger.debug("Finished handling pdf file")

            else:
                self._logger.debug("About to click on link " +
                                   link.hrefAttribute)
                new_links = link.click_and_yield(self.driver)
                self._logger.debug("Just finished clicking link")
                item["body"] = link.text
                for l in new_links:
                    item["links"].append(l.hrefAttribute)
                    request = scrapy.Request(l.hrefAttribute,
                                             callback=self.parse)
                    request.meta["link"] = l
                    request.meta['priority'] = max(
                        response.meta['priority'] - 10, 0)
                    request.meta['curdepth'] = response.meta['curdepth'] + 1
                    if 'useragent' in response.meta and \
                                    response.meta['useragent'] is not None:
                        request.headers['User-Agent'] = response.meta[
                            'useragent']

                    self._logger.debug(
                        "Making a requeset object for this newly found link: '{}'"
                        .format(request.url))
                    yield request
        except LinkException:
            self._logger.debug("Could not click link:" + str(link))

        # raw response has been processed, yield to item pipeline
        self._logger.debug("Yielding item")
        yield item