Example #1
    def enqueue_seeds_from_file(self, file_path):
        with open(file_path, 'r') as urls_file:
            for line in urls_file:
                url_info = UrlInfo(line.strip())
                url_info.is_seed = True
                url_info.discovered = time.time()
                self.enqueue(url_info)
Example #2
    def enqueue_skipped(self):
        # Todo: Not tested
        crawl_job = CrawlJob(self.crawl_job_name)
        scope = CrawlerScope(
            ConfigFetcher(self.crawl_job_name).get_scope_file())
        frontier = get_frontier_client(self.ec2_instance, None)
        for item in crawl_job.skipped_urls.select("select * from %s" %
                                                  crawl_job.skipped_urls.name):
            url_info = UrlInfo(item.name)
            if scope.get(url_info) == UrlClass.InScope:
                url_info.referrer_id = item[REFERRER_ID_ATTR_NAME]
                frontier.enqueue(url_info)
                item.delete()  # Todo: do this in batches?
Example #3
def test_get_original():
    fetcher = Fetcher('siemens20150201')

    url_info = UrlInfo(
        'http://' +
        'www.siemens.com.ph/_framework/slt/widget/tabbedpanels/sprytabbedpanels.css'
    )
    # url_info.fingerprint = '7fd8d4e6e43f454d9b60c9d7e246d3ac8bc6468b'
    url_info.fingerprint = '227e04c77d1476708086800b7a4da8cc6f997fea'

    orig = fetcher.get_original(url_info)

    print orig.name
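The fingerprint set above is a 40-character hex digest, which looks like a SHA-1 hash. Assuming get_original() matches crawled URLs by a content hash (an assumption, not something this example states), such a value would be produced roughly like this:

import hashlib

# Hypothetical sketch: a digest in the same 40-hex-character format as the
# hard-coded fingerprint above, computed over some fetched content.
page_content = '<html>example body</html>'
print hashlib.sha1(page_content).hexdigest()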
Example #4
def unpack_sdb_object(m, fields):
    u = UrlInfo(m['raw_url'], m['url'])

    for field, data_type in fields.iteritems():
        if field in m:
            if data_type is unicode:
                u.__dict__[field] = m[field]
            elif data_type is float:
                u.__dict__[field] = float(m[field])
            elif data_type is list:
                u.__dict__[field] = json.loads(m[field])
            elif data_type is set:
                u.__dict__[field] = set(json.loads(m[field]))
            elif data_type is dict:
                d = {}

                for name, value in m.iteritems():
                    if name in fields.keys():
                        continue
                    d[name] = value
                u.__dict__[field] = d
            elif data_type is datetime:
                u.__dict__[field] = dateutil.parser.parse(m[field])
            else:
                raise IllegalDataType(data_type)

    return u
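As an illustration only (the attribute names and values below are made up, not the project's real SimpleDB schema), a call pairs a SimpleDB item dict with a field/type map. Note that the dict branch above only runs when that field name is itself present in the item, and it then acts as a catch-all for every attribute not listed in fields:

fields = {
    'url': unicode,            # stored verbatim
    'fetched': float,          # seconds since the epoch
    'discovered': float,
    'response_headers': dict,  # rebuilt from the leftover attributes
}
m = {
    'raw_url': 'http://www.example.com/a b.html',
    'url': 'http://www.example.com/a%20b.html',
    'fetched': '1422745200.0',
    'discovered': '1422744000.0',
    'content-type': 'text/html; charset=utf-8',
    'response_headers': '',    # any value; it just needs to be present to trigger the dict branch
}
u = unpack_sdb_object(m, fields)
# u.fetched is now a float; u.response_headers collects 'raw_url' and
# 'content-type' because neither name appears in `fields`.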
Example #5
    def extract_urls(self, url_info, content):
        try:
            locations = []
            if url_info.url.endswith('txt'):
                locations = content.splitlines()
            elif url_info.url.endswith('xml'):
                tree = etree.parse(StringIO(content))
                if tree.docinfo.root_name == 'sitemapindex':
                    locations = tree.xpath('//*[name()="loc"]/text()')
                elif tree.docinfo.root_name == 'urlset':
                    locations = tree.xpath(
                        '//*[substring(name(), string-length(name())-2)="loc"]/text()'
                    )

            urls = set()
            for loc in [text.strip() for text in locations]:
                if loc.startswith('http'):
                    try:
                        urls.add(UrlInfo(loc))
                    except Exception, ex:
                        if type(ex) is InvalidUrl:
                            exc_info = None
                        else:
                            exc_info = sys.exc_info()
                        self.logger.log(LogType.InternalWarning,
                                        "Failed to extract URL from: %s" % loc,
                                        url_info.url, ex, exc_info)
        except XMLSyntaxError, ex:
            urls = []
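For reference, a standalone sketch of what the sitemapindex branch above pulls out; the XML here is a made-up minimal sitemap index, not project data:

from StringIO import StringIO
from lxml import etree

content = ('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
           '<sitemap><loc>http://www.example.com/sitemap1.xml</loc></sitemap>'
           '<sitemap><loc>http://www.example.com/sitemap2.xml</loc></sitemap>'
           '</sitemapindex>')

tree = etree.parse(StringIO(content))
print tree.docinfo.root_name                  # 'sitemapindex'
print tree.xpath('//*[name()="loc"]/text()')  # the two sitemap URLs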
Example #6
    def download_robots_txt(self, domain):
        robots_txt = RobotsTxtParser()  # is_allowed() always returns True for an empty parser.
        url_id = domain + '/robots.txt'
        m = self.crawl_job.crawled_urls.get_item(url_id)
        url_info = unpack_sdb_url_info(m) if m else UrlInfo("http://" + url_id)

        try:
            if not url_info.discovered:
                url_info.discovered = time.time()
            robots_dot_txt, from_cache = self.download_resource(
                url_info, self.crawl_job)
            if robots_dot_txt:
                robots_txt = RobotsTxtParser(
                    robots_dot_txt
                    if self.global_config.strict_robots_txt_parsing else
                    robots_dot_txt.lower(), url_info.expires)
                if not from_cache:
                    url_info.original = ORIGINAL_ATTR_VALUE_SELF
                    self.crawl_job.crawled_content.put(url_info,
                                                       robots_dot_txt)
                    self.save_crawled_url(url_info)
                self.extract_sitemaps(robots_txt.sitemaps, domain)
        except (requests.HTTPError, IncompleteRead, requests.ConnectionError,
                requests.Timeout, requests.RequestException,
                dns.exception.DNSException), ex:
            # The default RobotsTxtParser object will cause the fetcher to check again in RobotsTxtParser.MIN_TTL
            self.logger.log(LogType.ExternalWarning,
                            'Unable to download robots.txt', url_info.url, ex)
Example #7
def sqs_batch_test():
    urls = [
        UrlInfo(u'http://www.selinc.com.mx/pdf/papers/TP-2005-001 Aplicación de Relevadores Multifuncionales.pdf'),
        UrlInfo(u'http://www.selinc.com.mx/pdf/papers/')
    ]
    batch = []

    for i in xrange(0, len(urls)):
        sent_body = pack_message(urls[i])
        # Reminder: the raw body is sent, not a message object. Bad: sent_msg = SqsMessage(body=sent_body)
        batch.append((i, sent_body, 0))

    queue.write_batch(batch)

    time.sleep(2)

    dequeue()
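dequeue() is called here and in the sqs_test() example below but is not shown. A hypothetical counterpart might look like the sketch below; unpack_message() is assumed to invert pack_message(), and queue is the same boto SQS queue the tests write to:

def dequeue():
    msg = queue.read()
    if msg:
        # unpack_message() is hypothetical: the inverse of pack_message().
        url_info = unpack_message(msg.get_body())
        print url_info.url
        queue.delete_message(msg)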
Example #8
    def extract_sitemaps(self, sitemaps, domain):
        # Add the sitemap URLs to the frontier
        sitemap_url_infos = set()
        # Get sitemap.html, sitemap.htm, sitemap.php, sitemap.aspx, sitemap.asp, sitemap.jsp, sitemap.txt, etc?
        # Just to make sure that this one gets handled.
        sitemaps.append(create_absolute_url('sitemap.xml', 'http://' + domain))

        for sitemap_url in sitemaps:
            url_info = UrlInfo(sitemap_url)
            # this only works because the _content_type backing field is pickled.
            url_info._content_type = 'sitemap'
            url_info.discovered = time.time()
            if not self.url_is_seen(url_info):
                sitemap_url_infos.add(url_info)

        if sitemap_url_infos:
            self.add_to_frontier(sitemap_url_infos)
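create_absolute_url() is project code that is not shown here. For the fallback appended above it is presumably equivalent to a plain urljoin with the arguments reversed; this is a sketch of the expected result, not the project's implementation:

from urlparse import urljoin

# Expected shape of the fallback sitemap URL for a bare domain.
print urljoin('http://www.example.com', 'sitemap.xml')  # http://www.example.com/sitemap.xml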
Example #9
def purge_out_of_scope(self, crawl_job_name):
    """
    Go through crawled_urls and move them into skipped_urls and delete the content from s3.
    """
    # Todo: Not tested
    scope = CrawlerScope(ConfigFetcher(crawl_job_name).get_scope_file())
    crawl_job = CrawlJob(crawl_job_name)

    next_token_file_path = os.path.join(
        LOCAL_CRAWL_JOB_DIR, self.name,
        "purge_out_of_scope_next_crawled_urls_token.txt")

    with open(next_token_file_path, 'r') as next_token_file:
        prev_next_token = next_token_file.read()

    query = "select `url`, `referrer_id` from `{0}`".format(
        crawl_job.glossary.crawled_urls_table_name)
    try:
        items = crawl_job.crawled_urls.select(query,
                                              next_token=prev_next_token)
    except:
        items = crawl_job.crawled_urls.select(query)

    next_token = items.next_token

    count = 0
    try:
        for item in items:
            count += 1

            if prev_next_token != items.next_token:
                prev_next_token = next_token
                next_token = items.next_token

            url_info = UrlInfo(item['url'], canonized_url=item['url'])
            c = scope.get(url_info)
            if c == UrlClass.InScope or item.name.endswith(
                    'robots.txt') or item.name.endswith('sitemap.xml'):
                continue

            attributes = {REASON_ATTR_NAME: c}
            referrer_id = item.get(REFERRER_ID_ATTR_NAME, None)
            if referrer_id:
                attributes[REFERRER_ID_ATTR_NAME] = referrer_id
            crawl_job.skipped_urls.put_attributes(item.name, attributes)
            key = crawl_job.crawled_content_bucket.get_key(url_info.s3_key)

            if key:
                key.delete()
            item.delete()  # Todo: do this in batches?
    except Exception, ex:
        with open(next_token_file_path, 'w') as next_token_file:
            next_token_file.write(prev_next_token)
        print "Interrupted after %s records." % count
        raise
Example #10
def sqs_test():
    url = u'http://www.selinc.com.mx/pdf/papers/TP-2005-001 Aplicación de Relevadores Multifuncionales.pdf'
    # url = u'http://www.selinc.com.mx/pdf/papers/'
    sent_url_info = UrlInfo(url)
    sent_body = pack_message(sent_url_info)
    sent_msg = SqsMessage(body=sent_body)
    queue.write(sent_msg)

    time.sleep(2)

    dequeue()
Example #11
def test_seen_urls():
    fetcher = Fetcher('sel09292014')

    url_info = UrlInfo('selinc.lojablindada.com/customer/account/login/')

    # url_info = UrlInfo('http://www.yadayadayada.com/2')

    # fetcher.global_seen_urls.add(url_info.id, url_info.sld)
    # fetcher.mark_url_seen(url_info)

    # url_info = UrlInfo('http://www.yadayadayada.com/2')

    print fetcher.url_is_seen(url_info)
Example #12
    def test_put(self):
        url_info = UrlInfo('http://www.example.com/bogus.html')
        self._target.put(url_info, self._content)

        m = self._crawled_urls.get_item('www2.selinc.com/robots.txt')
        url_info = unpack_sdb_url_info(m)
        self._target.put(url_info, self._content)

        m = self._crawled_urls.get_item('lojavirtual.selinc.com.br/')
        url_info = unpack_sdb_url_info(m)
        self._target.put(url_info, self._content)

        m = self._crawled_urls.get_item(
            'www.selinc.com/assets/0/114/234/236/f9895377-e729-4242-bf6e-2cf76fdcdf58.pdf#page%3D2'
        )
        url_info = unpack_sdb_url_info(m)
        self._target.put(url_info, self._content)
Example #13
    def find_all_urls_with_attributes(self, attributes, soup, urls, base, url_info):
        for attribute in attributes:
            for element in soup.find_all(attrs={attribute: True}):
                try:
                    value = element.get(attribute)
                    if value == '':
                        continue
                    lcase_value = value.lower()
                    if 'javascript:' in lcase_value or 'mailto:' in lcase_value:
                        continue

                    new_url_info = UrlInfo(create_absolute_url(value, base=base))
                    if new_url_info == url_info:
                        continue
                    urls.add(new_url_info)
                except Exception, ex:
                    self.log_extract_failure(ex, url_info, element)
Example #14
def export_seeds_from_crawl_job(output_path, dest_crawl_job_name,
                                src_crawl_job_name, version):
    crawl_job = crawl_job_versions[version](src_crawl_job_name)
    scope = CrawlerScope(ConfigFetcher(dest_crawl_job_name).get_scope_file())
    query = "select `url` from `{0}` where `url` is not null and `redirectsTo` is null".format(
        crawl_job.crawled_urls.name)

    with open(output_path, 'w') as output_file:
        items = crawl_job.crawled_urls.select(query)

        count = 0
        try:
            for item in items:
                url = item['url']
                count += 1
                if scope.get(UrlInfo(url)) == UrlClass.InScope:
                    output_file.write(url + '\n')
        except Exception as ex:
            print "Interrupted after %s records" % count
            raise
Example #15
    def extract_urls(self, url_info, contents):
        try:
            content_type, charset = url_info.content_type
            unicode_contents = contents.decode(charset) if charset else contents.decode('utf-8')
        except UnicodeDecodeError:
            unicode_contents = contents.decode("iso-8859-1")  # try this.
        soup = BeautifulSoup(unicode_contents)

        # set the base URL for the page used for creating absolute URLs
        base = soup.base.get('href') if soup.base else url_info.url

        urls = set()

        for element in soup.find_all('object', {'data': True}):
            try:
                data = element.get('data')
                if data.startswith('http'):
                    new_url_info = UrlInfo(create_absolute_url(data, base=base))
                    if new_url_info:
                        urls.add(new_url_info)
            except Exception, ex:
                self.log_extract_failure(ex, url_info, element)
Example #16
    def extract_urls(self, url_info, contents):
        urls = set()
        css_unicode, encoding = tinycss.decoding.decode(contents)
        stylesheet = self.css_parser.parse_stylesheet(css_unicode, encoding=encoding)
        for rule in stylesheet.rules:
            if not isinstance(rule, RuleSet):
                continue
            for declaration in rule.declarations:
                for token in declaration.value:
                    try:
                        if token.type != 'URI' or not token.value:
                            continue
                        new_url_info = UrlInfo(create_absolute_url(token.value, base=url_info.url))
                        if new_url_info == url_info:
                            continue
                        urls.add(new_url_info)  # removes duplicates
                    except Exception, ex:
                        if type(ex) is InvalidUrl:
                            stack_trace = None
                        else:
                            stack_trace = sys.exc_info()
                        self.logger.log(LogType.InternalWarning, "Failed to extract URL from: %s" % token.value,
                                        url_info.url, ex, stack_trace)
Example #17
    def test_add_contains(self):
        target = SeenUrls('base_key')

        target.add(UrlInfo('http://www.example.com'))
        self.assertIn(UrlInfo('http://www.example.com/'), target)
        self.assertIn(UrlInfo('http://www.example.com'), target)
        target.add(UrlInfo('http://www.example.com/'))

        self.assertNotIn(UrlInfo('http://www.example.com/one'), target)

        target.add(UrlInfo('http://www.example.com/one/two'))
        target.add(UrlInfo('http://www.foo.com'))
        target.add(UrlInfo('http://www.foo.com/'))
        target.add(UrlInfo('http://www.foo.com/one/two'))

        self.assertIn(UrlInfo('http://www.example.com/one/two'), target)
        self.assertIn(UrlInfo('http://www.foo.com/one/two'), target)
        self.assertIn(UrlInfo('http://www.foo.com'), target)
        self.assertIn(UrlInfo('http://www.foo.com/'), target)
Example #18
from atrax.common.crawl_scope import CrawlerScope
from atrax.common.url_info import UrlInfo
from python_common.simple_list_file import SimpleListFile

target = CrawlerScope(SimpleListFile('/usr/local/crawl_jobs/siemens23012015/siemens23012015.scope'))

actual = target.get(UrlInfo(u'http://www.douban.com/recommend/?title=Traditional%20Chinese%20medicine&url=http://w1.siemens.com.cn/sustainable-city-en/sustainable-city.html%23tcm-healthcare'))

i = 9
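The returned value is a UrlClass constant; following the check the other examples use (the UrlClass import is omitted above, so its module path is not shown), the result would typically be tested like this:

if actual == UrlClass.InScope:
    print 'in scope'
else:
    print 'out of scope: %s' % actual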
Example #19
    def download_resource(self, url_info, reference_job=None):
        if reference_job and url_info.use_cached:
            content = reference_job.crawled_content.get(url_info.s3_key)
            if content is not None:
                return content, True

        fetched = sec_since_epoch_to_http_date(
            url_info.fetched) if url_info.fetched else None
        response = self.download(url_info, fetched=fetched, etag=url_info.etag)

        if response.status_code == 304:
            content = reference_job.crawled_content.get(url_info.s3_key)
            if content is not None:
                return content, True
            response = self.download(url_info)

        url_info.http_status = response.status_code
        url_info.response_headers = response.headers
        url_info.fetcher = self.id
        url_info.fetched = time.time()

        response.raise_for_status()

        if response.status_code in [300, 301, 302, 303, 307, 308]:
            if url_info.num_redirects >= MAX_REDIRECTS:
                raise requests.TooManyRedirects('Exceeded %s redirects.' %
                                                MAX_REDIRECTS)

            new_raw_url = urljoin(url_info.raw_url,
                                  response.headers['location'])
            if url_info.raw_url == new_raw_url:  # URL redirects back to itself
                return None, None

            new_url_info = UrlInfo(new_raw_url)
            self.url_transformer.transform(new_url_info)

            if url_info.id == new_url_info.id:
                # Only the scheme changed (usually http -> https) so just fetch that URL right away instead
                url_info.change_scheme(extract_scheme(new_raw_url))
                return self.download_resource(url_info, reference_job)

            new_url_info.referrer_id = url_info.id
            new_url_info.num_redirects = url_info.num_redirects + 1
            new_url_info.discovered = time.time()

            url_info.redirects_to = new_url_info.id
            self.save_redirected_url(url_info)  # Log the redirect

            # Calling self.process_url() on the new URL can cause unbounded recursion so return None here. Redirects
            # from robots.txt files are usually useless anyway. It is either a domain level redirect or an error page.
            if url_info.url.endswith('/robots.txt'):
                return None, None

            if self.process_url(new_url_info):
                self.mark_url_seen(
                    new_url_info
                )  # local only because new_url_info isn't added to the frontier.
                self.add_to_frontier([new_url_info])

            # The body of non-300 type responses is usually empty.
            if response.status_code != 300:
                return None, None

        return response.content, False
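The 304 handling above depends on a conditional request. A standalone illustration with plain requests follows; this is not the project's download() helper, whose behavior is only inferred from the call above:

import requests

url = 'http://www.example.com/robots.txt'
headers = {
    # An HTTP-date, which is what sec_since_epoch_to_http_date() presumably produces.
    'If-Modified-Since': 'Sun, 01 Feb 2015 00:00:00 GMT',
    # Sent when a previous response supplied url_info.etag.
    'If-None-Match': '"some-etag"',
}
response = requests.get(url, headers=headers, allow_redirects=False)
print response.status_code  # 304 means the cached copy can be reused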
Example #20
    def dequeue(self, fetcher_id):
        line = self.file.readline()
        if line:
            return self.cur, UrlInfo(line)
        else:
            return None, None