def enqueue_seeds_from_file(self, file_path):
    with open(file_path, 'r') as urls_file:
        for line in urls_file:
            url_info = UrlInfo(line.strip())
            url_info.is_seed = True
            url_info.discovered = time.time()
            self.enqueue(url_info)

def enqueue_skipped(self):
    # Todo: Not tested
    crawl_job = CrawlJob(self.crawl_job_name)
    scope = CrawlerScope(ConfigFetcher(self.crawl_job_name).get_scope_file())
    frontier = get_frontier_client(self.ec2_instance, None)
    for item in crawl_job.skipped_urls.select("select * from %s" % crawl_job.skipped_urls.name):
        url_info = UrlInfo(item.name)
        if scope.get(url_info) == UrlClass.InScope:
            url_info.referrer_id = item[REFERRER_ID_ATTR_NAME]
            frontier.enqueue(url_info)
            item.delete()  # Todo: do this in batches?

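# A possible shape for the "do this in batches?" Todo above, assuming crawl_job.skipped_urls is a
# boto SimpleDB Domain (batch_delete_attributes accepts up to 25 item names per call). This is a
# sketch only; the batch size and the point at which it is flushed are assumptions, not project code.
def _delete_items_in_batches(domain, item_names, batch_size=25):
    for start in xrange(0, len(item_names), batch_size):
        chunk = item_names[start:start + batch_size]
        # A value of None asks SimpleDB to delete every attribute of the item, i.e. the whole item.
        domain.batch_delete_attributes(dict((name, None) for name in chunk))
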
def test_get_original():
    fetcher = Fetcher('siemens20150201')
    url_info = UrlInfo(
        'http://www.siemens.com.ph/_framework/slt/widget/tabbedpanels/sprytabbedpanels.css')
    # url_info.fingerprint = '7fd8d4e6e43f454d9b60c9d7e246d3ac8bc6468b'
    url_info.fingerprint = '227e04c77d1476708086800b7a4da8cc6f997fea'
    orig = fetcher.get_original(url_info)
    print orig.name

def unpack_sdb_object(m, fields):
    u = UrlInfo(m['raw_url'], m['url'])
    for field, data_type in fields.iteritems():
        if field in m:
            if data_type is unicode:
                u.__dict__[field] = m[field]
            elif data_type is float:
                u.__dict__[field] = float(m[field])
            elif data_type is list:
                u.__dict__[field] = json.loads(m[field])
            elif data_type is set:
                u.__dict__[field] = set(json.loads(m[field]))
            elif data_type is dict:
                # Sweep every attribute that is not an explicitly declared field into this dict.
                d = {}
                for name, value in m.iteritems():
                    if name in fields:
                        continue
                    d[name] = value
                u.__dict__[field] = d
            elif data_type is datetime:
                u.__dict__[field] = dateutil.parser.parse(m[field])
            else:
                raise IllegalDataType(data_type)
    return u

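# Illustrative use of unpack_sdb_object. The field names and values below are assumptions for the
# example, not the project's real SimpleDB schema: SimpleDB stores everything as strings, and the
# fields map tells the function how to coerce each attribute back to its Python type.
sample_fields = {'discovered': float, 'redirects': list, 'etag': unicode}
sample_item = {
    'raw_url': 'http://www.example.com/a b.html',
    'url': 'http://www.example.com/a%20b.html',
    'discovered': '1422748800.0',
    'redirects': '["http://www.example.com/old.html"]',
}
sample_url_info = unpack_sdb_object(sample_item, sample_fields)
# sample_url_info.discovered is now a float; sample_url_info.redirects is a list; etag is absent and skipped.
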
def extract_urls(self, url_info, content):
    try:
        locations = []
        if url_info.url.endswith('txt'):
            locations = content.splitlines()
        elif url_info.url.endswith('xml'):
            tree = etree.parse(StringIO(content))
            if tree.docinfo.root_name == 'sitemapindex':
                locations = tree.xpath('//*[name()="loc"]/text()')
            elif tree.docinfo.root_name == 'urlset':
                locations = tree.xpath(
                    '//*[substring(name(), string-length(name())-2)="loc"]/text()')
        urls = set()
        for loc in [text.strip() for text in locations]:
            if loc.startswith('http'):
                try:
                    urls.add(UrlInfo(loc))
                except Exception, ex:
                    if type(ex) is InvalidUrl:
                        exc_info = None
                    else:
                        exc_info = sys.exc_info()
                    self.logger.log(LogType.InternalWarning,
                                    "Failed to extract URL from: %s" % loc,
                                    url_info.url, ex, exc_info)
    except XMLSyntaxError, ex:
        urls = []
    return urls

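# Standalone illustration (the sitemap below is made-up sample data) of the namespace-agnostic XPath
# used for <urlset> documents above: name() ignores the default sitemap namespace, so the expression
# still finds the <loc> elements.
from StringIO import StringIO
from lxml import etree

sample_sitemap = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>http://www.example.com/page1.html</loc></url>
  <url><loc>http://www.example.com/page2.html</loc></url>
</urlset>"""

sample_tree = etree.parse(StringIO(sample_sitemap))
print sample_tree.docinfo.root_name  # urlset
print sample_tree.xpath('//*[substring(name(), string-length(name())-2)="loc"]/text()')
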
def download_robots_txt(self, domain):
    robots_txt = RobotsTxtParser()  # is_allowed() always returns True for an empty parser.
    url_id = domain + '/robots.txt'
    m = self.crawl_job.crawled_urls.get_item(url_id)
    url_info = unpack_sdb_url_info(m) if m else UrlInfo("http://" + url_id)
    try:
        if not url_info.discovered:
            url_info.discovered = time.time()
        robots_dot_txt, from_cache = self.download_resource(url_info, self.crawl_job)
        if robots_dot_txt:
            robots_txt = RobotsTxtParser(
                robots_dot_txt if self.global_config.strict_robots_txt_parsing else robots_dot_txt.lower(),
                url_info.expires)
            if not from_cache:
                url_info.original = ORIGINAL_ATTR_VALUE_SELF
                self.crawl_job.crawled_content.put(url_info, robots_dot_txt)
                self.save_crawled_url(url_info)
            self.extract_sitemaps(robots_txt.sitemaps, domain)
    except (requests.HTTPError, IncompleteRead, requests.ConnectionError, requests.Timeout,
            requests.RequestException, dns.exception.DNSException), ex:
        # The default RobotsTxtParser object will cause the fetcher to check again in RobotsTxtParser.MIN_TTL
        self.logger.log(LogType.ExternalWarning, 'Unable to download robots.txt', url_info.url, ex)
    return robots_txt

def sqs_batch_test():
    urls = [
        UrlInfo(u'http://www.selinc.com.mx/pdf/papers/TP-2005-001 Aplicación de Relevadores Multifuncionales.pdf'),
        UrlInfo(u'http://www.selinc.com.mx/pdf/papers/')
    ]
    batch = []
    for i in xrange(0, len(urls)):
        sent_body = pack_message(urls[i])
        # Reminder: the packed body is sent, not a message object. Bad: sent_msg = SqsMessage(body=sent_body)
        batch.append((i, sent_body, 0))
    queue.write_batch(batch)
    time.sleep(2)
    dequeue()

def extract_sitemaps(self, sitemaps, domain):
    # Add the sitemap URLs to the frontier.
    sitemap_url_infos = set()
    # Get sitemap.html, sitemap.htm, sitemap.php, sitemap.aspx, sitemap.asp, sitemap.jsp, sitemap.txt, etc?
    # Append sitemap.xml just to make sure that this one gets handled.
    sitemaps.append(create_absolute_url('sitemap.xml', 'http://' + domain))
    for sitemap_url in sitemaps:
        url_info = UrlInfo(sitemap_url)
        # This only works because the _content_type backing field is pickled.
        url_info._content_type = 'sitemap'
        url_info.discovered = time.time()
        if not self.url_is_seen(url_info):
            sitemap_url_infos.add(url_info)
    if sitemap_url_infos:
        self.add_to_frontier(sitemap_url_infos)

def purge_out_of_scope(self, crawl_job_name):
    """Go through crawled_urls, move out-of-scope items into skipped_urls, and delete their content from S3."""
    # Todo: Not tested
    scope = CrawlerScope(ConfigFetcher(crawl_job_name).get_scope_file())
    crawl_job = CrawlJob(crawl_job_name)
    next_token_file_path = os.path.join(LOCAL_CRAWL_JOB_DIR, self.name,
                                        "purge_out_of_scope_next_crawled_urls_token.txt")
    with open(next_token_file_path, 'r') as next_token_file:
        prev_next_token = next_token_file.read()
    query = "select `url`, `referrer_id` from `{0}`".format(crawl_job.glossary.crawled_urls_table_name)
    try:
        items = crawl_job.crawled_urls.select(query, next_token=prev_next_token)
    except:
        items = crawl_job.crawled_urls.select(query)
    next_token = items.next_token
    count = 0
    try:
        for item in items:
            count += 1
            if prev_next_token != items.next_token:
                prev_next_token = next_token
                next_token = items.next_token
            url_info = UrlInfo(item['url'], canonized_url=item['url'])
            c = scope.get(url_info)
            if c == UrlClass.InScope or item.name.endswith('robots.txt') or item.name.endswith('sitemap.xml'):
                continue
            attributes = {REASON_ATTR_NAME: c}
            referrer_id = item.get(REFERRER_ID_ATTR_NAME, None)
            if referrer_id:
                attributes[REFERRER_ID_ATTR_NAME] = referrer_id
            crawl_job.skipped_urls.put_attributes(item.name, attributes)
            key = crawl_job.crawled_content_bucket.get_key(url_info.s3_key)
            if key:
                key.delete()
            item.delete()  # Todo: do this in batches?
    except Exception, ex:
        with open(next_token_file_path, 'w') as next_token_file:
            next_token_file.write(prev_next_token)
        print "Interrupted after %s records." % count
        raise

def sqs_test():
    url = u'http://www.selinc.com.mx/pdf/papers/TP-2005-001 Aplicación de Relevadores Multifuncionales.pdf'
    # url = u'http://www.selinc.com.mx/pdf/papers/'
    sent_url_info = UrlInfo(url)
    sent_body = pack_message(sent_url_info)
    sent_msg = SqsMessage(body=sent_body)
    queue.write(sent_msg)
    time.sleep(2)
    dequeue()

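# Hypothetical companion to the two SQS tests above; dequeue() is not defined in this file. The
# sketch assumes boto's Queue.read()/delete_message() and an unpack_message() that mirrors
# pack_message(); both the helper name and the visibility timeout are assumptions.
def dequeue():
    msg = queue.read(visibility_timeout=30)
    if msg is None:
        print 'queue is empty'
        return
    received_url_info = unpack_message(msg.get_body())
    print received_url_info.url
    queue.delete_message(msg)
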
def test_seen_urls():
    fetcher = Fetcher('sel09292014')
    url_info = UrlInfo('selinc.lojablindada.com/customer/account/login/')
    # url_info = UrlInfo('http://www.yadayadayada.com/2')
    # fetcher.global_seen_urls.add(url_info.id, url_info.sld)
    # fetcher.mark_url_seen(url_info)
    # url_info = UrlInfo('http://www.yadayadayada.com/2')
    print fetcher.url_is_seen(url_info)

def test_put(self):
    url_info = UrlInfo('http://www.example.com/bogus.html')
    self._target.put(url_info, self._content)

    m = self._crawled_urls.get_item('www2.selinc.com/robots.txt')
    url_info = unpack_sdb_url_info(m)
    self._target.put(url_info, self._content)

    m = self._crawled_urls.get_item('lojavirtual.selinc.com.br/')
    url_info = unpack_sdb_url_info(m)
    self._target.put(url_info, self._content)

    m = self._crawled_urls.get_item(
        'www.selinc.com/assets/0/114/234/236/f9895377-e729-4242-bf6e-2cf76fdcdf58.pdf#page%3D2')
    url_info = unpack_sdb_url_info(m)
    self._target.put(url_info, self._content)

def find_all_urls_with_attributes(self, attributes, soup, urls, base, url_info):
    for attribute in attributes:
        for element in soup.find_all(attrs={attribute: True}):
            try:
                value = element.get(attribute)
                if value == '':
                    continue
                lcase_value = value.lower()
                if 'javascript:' in lcase_value or 'mailto:' in lcase_value:
                    continue
                new_url_info = UrlInfo(create_absolute_url(value, base=base))
                if new_url_info == url_info:
                    continue
                urls.add(new_url_info)
            except Exception, ex:
                self.log_extract_failure(ex, url_info, element)

def export_seeds_from_crawl_job(output_path, dest_crawl_job_name, src_crawl_job_name, version):
    crawl_job = crawl_job_versions[version](src_crawl_job_name)
    scope = CrawlerScope(ConfigFetcher(dest_crawl_job_name).get_scope_file())
    query = "select `url` from `{0}` where `url` is not null and `redirectsTo` is null".format(
        crawl_job.crawled_urls.name)
    with open(output_path, 'w') as output_file:
        items = crawl_job.crawled_urls.select(query)
        count = 0
        try:
            for item in items:
                url = item['url']
                count += 1
                if scope.get(UrlInfo(url)) == UrlClass.InScope:
                    output_file.write(url + '\n')
        except Exception as ex:
            print "Interrupted after %s records" % count
            raise

def extract_urls(self, url_info, contents):
    try:
        content_type, charset = url_info.content_type
        unicode_contents = contents.decode(charset) if charset else contents.decode('utf-8')
    except UnicodeDecodeError:
        unicode_contents = contents.decode("iso-8859-1")  # try this.
    soup = BeautifulSoup(unicode_contents)
    # Set the base URL of the page, used for creating absolute URLs.
    base = soup.base.get('href') if soup.base else url_info.url
    urls = set()
    for element in soup.find_all('object', {'data': True}):
        try:
            data = element.get('data')
            if data.startswith('http'):
                new_url_info = UrlInfo(create_absolute_url(data, base=base))
                if new_url_info:
                    urls.add(new_url_info)
        except Exception, ex:
            self.log_extract_failure(ex, url_info, element)
    return urls

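# Standalone illustration (the markup below is made up) of the <base href> handling above: when a
# page declares <base>, relative references resolve against it rather than against the page's own
# URL. The standard library's urljoin stands in for the project's create_absolute_url here, which
# is assumed to behave similarly.
from urlparse import urljoin
from bs4 import BeautifulSoup

sample_html = ('<html><head><base href="http://cdn.example.com/assets/"></head>'
               '<body><object data="player.swf"></object></body></html>')
sample_soup = BeautifulSoup(sample_html)
sample_base = sample_soup.base.get('href') if sample_soup.base else 'http://www.example.com/page.html'
print urljoin(sample_base, sample_soup.find('object').get('data'))  # http://cdn.example.com/assets/player.swf
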
def extract_urls(self, url_info, contents):
    urls = set()
    css_unicode, encoding = tinycss.decoding.decode(contents)
    stylesheet = self.css_parser.parse_stylesheet(css_unicode, encoding=encoding)
    for rule in stylesheet.rules:
        if not isinstance(rule, RuleSet):
            continue
        for declaration in rule.declarations:
            for token in declaration.value:
                try:
                    if token.type != 'URI' or not token.value:
                        continue
                    new_url_info = UrlInfo(create_absolute_url(token.value, base=url_info.url))
                    if new_url_info == url_info:
                        continue
                    urls.add(new_url_info)  # the set removes duplicates
                except Exception, ex:
                    if type(ex) is InvalidUrl:
                        stack_trace = None
                    else:
                        stack_trace = sys.exc_info()
                    self.logger.log(LogType.InternalWarning,
                                    "Failed to extract URL from: %s" % token.value,
                                    url_info.url, ex, stack_trace)
    return urls

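# Tiny standalone check (the CSS below is sample data) of the token handling above: url() values
# parse as 'URI' tokens whose .value is the unquoted URL, which is then resolved against the
# stylesheet's own URL. urljoin stands in for the project's create_absolute_url in this sketch.
import tinycss
from urlparse import urljoin

sample_css = u'body { background: url("../img/bg.png") no-repeat; }'
for sample_rule in tinycss.make_parser().parse_stylesheet(sample_css).rules:
    for sample_declaration in sample_rule.declarations:
        for sample_token in sample_declaration.value:
            if sample_token.type == 'URI':
                print urljoin('http://www.example.com/css/site.css', sample_token.value)  # .../img/bg.png
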
def test_add_contains(self):
    target = SeenUrls('base_key')
    target.add(UrlInfo('http://www.example.com'))
    self.assertIn(UrlInfo('http://www.example.com/'), target)
    self.assertIn(UrlInfo('http://www.example.com'), target)
    target.add(UrlInfo('http://www.example.com/'))
    self.assertNotIn(UrlInfo('http://www.example.com/one'), target)
    target.add(UrlInfo('http://www.example.com/one/two'))
    target.add(UrlInfo('http://www.foo.com'))
    target.add(UrlInfo('http://www.foo.com/'))
    target.add(UrlInfo('http://www.foo.com/one/two'))
    self.assertIn(UrlInfo('http://www.example.com/one/two'), target)
    self.assertIn(UrlInfo('http://www.foo.com/one/two'), target)
    self.assertIn(UrlInfo('http://www.foo.com'), target)
    self.assertIn(UrlInfo('http://www.foo.com/'), target)

from atrax.common.crawl_scope import CrawlerScope
from atrax.common.url_info import UrlInfo
from python_common.simple_list_file import SimpleListFile

target = CrawlerScope(SimpleListFile('/usr/local/crawl_jobs/siemens23012015/siemens23012015.scope'))
actual = target.get(UrlInfo(u'http://www.douban.com/recommend/?title=Traditional%20Chinese%20medicine&url=http://w1.siemens.com.cn/sustainable-city-en/sustainable-city.html%23tcm-healthcare'))
i = 9  # convenient line to set a breakpoint on and inspect `actual`

def download_resource(self, url_info, reference_job=None):
    if reference_job and url_info.use_cached:
        content = reference_job.crawled_content.get(url_info.s3_key)
        if content is not None:
            return content, True

    fetched = sec_since_epoch_to_http_date(url_info.fetched) if url_info.fetched else None
    response = self.download(url_info, fetched=fetched, etag=url_info.etag)
    if response.status_code == 304:
        content = reference_job.crawled_content.get(url_info.s3_key)
        if content is not None:
            return content, True
        response = self.download(url_info)

    url_info.http_status = response.status_code
    url_info.response_headers = response.headers
    url_info.fetcher = self.id
    url_info.fetched = time.time()
    response.raise_for_status()

    if response.status_code in [300, 301, 302, 303, 307, 308]:
        if url_info.num_redirects >= MAX_REDIRECTS:
            raise requests.TooManyRedirects('Exceeded %s redirects.' % MAX_REDIRECTS)
        new_raw_url = urljoin(url_info.raw_url, response.headers['location'])
        if url_info.raw_url == new_raw_url:  # URL redirects back to itself
            return None, None
        new_url_info = UrlInfo(new_raw_url)
        self.url_transformer.transform(new_url_info)
        if url_info.id == new_url_info.id:
            # Only the scheme changed (usually http -> https) so just fetch that URL right away instead.
            url_info.change_scheme(extract_scheme(new_raw_url))
            return self.download_resource(url_info, reference_job)
        new_url_info.referrer_id = url_info.id
        new_url_info.num_redirects = url_info.num_redirects + 1
        new_url_info.discovered = time.time()
        url_info.redirects_to = new_url_info.id
        self.save_redirected_url(url_info)  # Log the redirect

        # Calling self.process_url() on the new URL can cause unbounded recursion so return None here.
        # Redirects from robots.txt files are usually useless anyway. It is either a domain level
        # redirect or an error page.
        if url_info.url.endswith('/robots.txt'):
            return None, None
        if self.process_url(new_url_info):
            self.mark_url_seen(new_url_info)  # local only because new_url_info isn't added to the frontier.
            self.add_to_frontier([new_url_info])
        # The body of non-300 type responses is usually empty.
        if response.status_code != 300:
            return None, None
    return response.content, False

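# A minimal sketch (not the project's actual download() implementation) of the conditional request
# that the 304 branch above relies on: send If-Modified-Since / If-None-Match so the server can
# answer 304 Not Modified and the cached copy in crawled_content can be reused. The function and
# parameter names here are assumptions.
import requests

def conditional_get(url, last_fetched_http_date=None, etag=None, timeout=30):
    headers = {}
    if last_fetched_http_date:
        headers['If-Modified-Since'] = last_fetched_http_date
    if etag:
        headers['If-None-Match'] = etag
    # Redirects are not followed here; download_resource() handles 3xx responses itself.
    return requests.get(url, headers=headers, allow_redirects=False, timeout=timeout)
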
def dequeue(self, fetcher_id):
    line = self.file.readline()
    if line:
        return self.cur, UrlInfo(line)
    else:
        return None, None