Example #1
    def aggregate_batch(self, batch):
        """
        Given a particular batch, aggregate the stats from its children into
        the data model and return it
        """
        # find all the invalid sitescans and delete them
        # first, take out those with invalid domains
        requested_domains = [get_domain(s) for s in batch.sites]
        for sitescan in batch.sitescan_set.iterator():
            if get_domain(sitescan.site_url) not in requested_domains:
                sitescan.delete()
        # then take out those with no URLScans (a la mail.ru)
        for sitescan in batch.sitescan_set.iterator():
            if not sitescan.urlscan_set.count():
                sitescan.delete()

        sitescans = model.SiteScan.objects.filter(batch=batch).iterator()

        # Initialize counters
        total_rules = 0
        total_properties = 0
        total_pages_scanned = 0
        total_css_issues = 0
        total_ua_issues = 0

        # Aggregate data for each sitescan
        for sitescan in sitescans:
            sitescan_data = self.aggregate_sitescan(sitescan)
            total_rules += sitescan_data.num_rules
            total_properties += sitescan_data.num_properties
            total_pages_scanned += sitescan_data.scanned_pages
            total_css_issues += sitescan_data.css_issues
            total_ua_issues += 1 if sitescan_data.ua_issues else 0

        # Actually update the batchdata field
        data = model.BatchData.objects.create(
            batch=batch,
            num_rules=total_rules,
            num_properties=total_properties,
            scanned_pages=total_pages_scanned,
            css_issues=total_css_issues,
            ua_issues=total_ua_issues,
            )

        # Count and store regressions and fixes
        prev = DBUtils.get_previous_batch(batch)
        if prev and prev.data_aggregated:
            regressions, fixes = RegressionHunter.get_ua_diffs(prev, batch)
            data.ua_issues_regressed = len(regressions)
            data.ua_issues_fixed = len(fixes)
            regressions, fixes = RegressionHunter.get_css_diffs(prev, batch)
            data.css_issues_regressed = len(regressions)
            data.css_issues_fixed = len(fixes)
            data.save()

        # Mark the batch complete
        batch.data_aggregated = True
        batch.save()

        # Return the aggregated data, as promised by the docstring
        return data
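The aggregation above leans on helpers that are not shown in these examples (DBUtils.get_previous_batch and the RegressionHunter diff functions). For orientation only, here is a minimal sketch of how the stored BatchData row might be read back afterwards; the summarize_batch helper is hypothetical and assumes the same model module imported by the example:

def summarize_batch(batch):
    # Hypothetical helper: read back the totals stored by aggregate_batch().
    data = model.BatchData.objects.get(batch=batch)
    return {
        'pages': data.scanned_pages,
        'rules': data.num_rules,
        'properties': data.num_properties,
        'css_issues': data.css_issues,
        'ua_issues': data.ua_issues,
    }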
Example #2
    def bad_sites(self):
        """ Returns a list of site urls that did not get scraped """
        sitelist = list(self.sites)  # work on a copy so self.sites is untouched
        for sitescan in self.sitescan_set.iterator():
            for site in sitelist:
                if get_domain(site) == get_domain(sitescan.site_url):
                    # this domain was scraped, so take it out
                    sitelist.remove(site)
                    break
        # now sitelist contains only "bad sites", since the good ones were
        # removed in the loop
        return sitelist
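The nested loops above are O(len(sites) * len(sitescans)). If the site lists grow large, the same "which requested sites were never scraped" check can be expressed with a set of scraped domains. This is only a sketch built on what is visible in the examples (a sites iterable of URLs and the project's get_domain helper), not code from the project itself:

def bad_sites_via_set(batch):
    # Sketch: collect every scraped domain once, then keep the requested
    # sites whose domain never shows up in that set.
    scanned_domains = set(get_domain(ss.site_url)
                          for ss in batch.sitescan_set.iterator())
    return [site for site in batch.sites
            if get_domain(site) not in scanned_domains]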
Example #3
    def make_requests_from_url(self, url):
        """
        Generates one request per user_agent
        """
        sitescan, _ = model.SiteScan.objects.get_or_create(
            batch=self.batch,
            site_url_hash=sha256(get_domain(url)).hexdigest(),
            defaults={'site_url': url})

        # Generate different UA requests for each UA
        for batch_user_agent in self.batch_user_agents:
            ua = batch_user_agent
            new_request = Request(url, dont_filter=True)
            new_request.headers.setdefault('User-Agent', ua.ua_string)
            new_request.meta['sitescan'] = sitescan
            new_request.meta['user_agent'] = ua
            self.log("Created request for {0} with ua {1}".format(
                url, ua.ua_string))
            yield new_request
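get_domain() is defined elsewhere in the project and is used both here and in the SiteScan hashing above. As an illustration of the behaviour the examples rely on (pages from the same host map to the same SiteScan), a plausible Python 2 implementation might look like the following; the project's real version may normalize URLs differently:

from urlparse import urlparse  # Python 2 import, matching this codebase

def get_domain(url):
    # Illustrative only: reduce a URL to its host so every page on the
    # same site hashes to the same SiteScan.
    return urlparse(url).netloc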
Example #4
    def parse(self, response):
        """
        Function called by the scrapy downloader after a site url has been
        visited
        """
        content_type = self.get_content_type(response.headers)

        sitescan = response.meta.get('sitescan')
        if sitescan is None:
            # This sitescan needs to be created
            sitescan, ss_created = model.SiteScan.objects.get_or_create(
                batch=self.batch,
                site_url_hash=sha256(get_domain(response.url)).hexdigest(),
                defaults={'site_url': response.url})

            if not ss_created:
                # Duplicate URL in the text file, ignore this site
                return

        if response.meta.get('user_agent') is None:
            # Generate different UA requests for each UA
            for batch_user_agent in self.batch_user_agents:
                ua = batch_user_agent

                # Generate new request
                new_request = Request(response.url)
                new_request.headers.setdefault('User-Agent', ua.ua_string)
                new_request.meta['referrer'] = response.meta.get('referrer')
                new_request.meta['sitescan'] = sitescan
                new_request.meta['user_agent'] = ua
                new_request.meta['content_type'] = content_type

                yield new_request

        else:
            if 'text/html' not in self.get_content_type(response.headers):
                # For linked content, find the urlscan it linked from
                urlscan = model.URLScan.objects.get(
                    site_scan=sitescan,
                    page_url_hash=sha256(
                        response.meta['referrer']).hexdigest())
            else:
                # Only create urlscans for text/html
                urlscan, us_created = model.URLScan.objects.get_or_create(
                    site_scan=sitescan,
                    page_url_hash=sha256(response.url).hexdigest(),
                    defaults={
                        'page_url': response.url,
                        'timestamp': self.get_now_time()
                    })

                # Continue crawling
                # Parse stylesheet links, scripts, and hyperlinks
                hxs = HtmlXPathSelector(response)

                # Extract other target links
                try:
                    css_links = hxs.select('//link/@href').extract()
                except TypeError:
                    css_links = []

                try:
                    js_links = hxs.select('//script/@src').extract()
                except TypeError:
                    js_links = []

                try:
                    hyperlinks = hxs.select('//a/@href').extract()
                except TypeError:
                    hyperlinks = []

                # Using a set removes duplicate links.
                all_links = set(hyperlinks + js_links + css_links)

                # Examine links, yield requests if they are valid
                for url in all_links:

                    if not url.startswith('http://'):
                        # ensure that links are to real sites
                        if url.startswith('javascript:'):
                            continue
                        else:
                            url = urljoin(response.url, url)

                    ua = response.meta['user_agent']

                    request = Request(url)
                    request.headers.setdefault('User-Agent', ua.ua_string)
                    request.meta['referrer'] = response.url
                    request.meta['sitescan'] = sitescan
                    request.meta['user_agent'] = ua
                    request.meta['content_type'] = None

                    yield request

            # The response contains a user agent, we should yield an item
            item = MarkupItem()
            item['content_type'] = self.get_content_type(response.headers)
            item['filename'] = os.path.basename(urlparse(response.url).path)
            item['headers'] = unicode(response.headers)
            item['meta'] = response.meta
            item['raw_content'] = response.body
            item['sitescan'] = sitescan
            item['urlscan'] = urlscan
            item['url'] = response.url
            item['user_agent'] = response.meta.get('user_agent')
            item['redirected_from'] = response.meta.get('redirected_from', u'')
            yield item
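parse() branches on self.get_content_type(), which is also not shown in these examples. A minimal sketch of such a helper, assuming Scrapy's dict-like response.headers and an empty string when the header is missing; the project's actual implementation may decode or normalize the value further:

    def get_content_type(self, headers):
        # Sketch: return the raw Content-Type header value, or '' if the
        # response did not include one.
        if headers and 'Content-Type' in headers:
            return headers['Content-Type']
        return ''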