def execute(self, task): """Execute the task and return the result. """ url = task.url encoded_url = str(url) path_encoded = url.path.encode(self._encoding) encoded_url = '%s%s/' % (encoded_url[:-len(path_encoded)], urllib.quote(path_encoded.rstrip('/'))) opener = urllib2.build_opener() opener.addheaders = [('User-agent', 'Arachne/%s' % __version__)] try: handler = opener.open(encoded_url) data = handler.read() # Everything seems to be OK, add entries to the result. result = CrawlResult(task, True) for match in self._ENTRIES_RE.finditer(data): entry_data = {} entry_data['is_dir'] = (match.group(1).lower() == 'dir') entry_name = self._ENTITIES_RE.sub(self._sub_entity, match.group(2)) result.add_entry(entry_name, entry_data) handler.close() self._results.put(result) self._tasks.report_done(task) except urllib2.HTTPError, error: if error.code == 404: # The directory does not exists. Generate a not-found result # because the entire directory tree should be removed from the # index. result = CrawlResult(task, False) self._results.put(result) self._tasks.report_done(task) else: self._tasks.report_error_dir(task) logging.error('Error visiting "%s" (%s: %s)' % (url, error.code, error.msg))
class TestCrawlResult(unittest.TestCase):
    """Tests for the properties and container protocol of ``CrawlResult``."""

    def setUp(self):
        """Create an empty result plus the entries the tests will add."""
        url = URL('ftp://deltha.uh.cu/')
        site_id = 'aa958756e769188be9f76fbdb291fe1b2ddd4777'
        self._num_entries = 10
        self._found = True
        self._task = CrawlTask(site_id, url)
        # The first half of the entries are directories, the rest files.
        # ``//`` keeps the threshold an integer regardless of how ``/``
        # behaves in the running interpreter.
        self._entries = [(str(i), {'is_dir': i < (self._num_entries // 2)})
                         for i in range(self._num_entries)]
        self._result = CrawlResult(self._task, self._found)

    def _add_all_entries(self):
        """Add every prepared entry to the result under test."""
        for entry, data in self._entries:
            self._result.add_entry(entry, data)

    def _check_entry_names(self, result):
        """Check that iterating ``result`` yields each prepared name once."""
        # A real list is required here because names are removed one by one;
        # ``remove`` raises ValueError on an unexpected or repeated name.
        pending = [entry for entry, data in self._entries]
        for entry, data in result:
            pending.remove(entry)
        self.assertEqual(len(pending), 0)

    def test_properties(self):
        """The result exposes the task and the found flag it was given."""
        self.assertEqual(self._result.task.site_id, self._task.site_id)
        self.assertEqual(str(self._result.task.url), str(self._task.url))
        self.assertEqual(self._result.found, self._found)

    def test_add_entry_and_iter(self):
        """Iteration yields an (entry, data) pair for every added entry."""
        self._add_all_entries()
        self._check_entry_names(self._result)

    def test_contains(self):
        """``in`` is true only for names that were added."""
        entry, data = self._entries[0]
        self._result.add_entry(entry, data)
        self.assertTrue(entry in self._result)
        self.assertFalse(entry * 2 in self._result)

    def test_len(self):
        """``len`` counts the added entries."""
        self._add_all_entries()
        self.assertEqual(len(self._result), self._num_entries)

    def test_getitem(self):
        """Indexing returns the entry data and raises KeyError if absent."""
        entry, data = self._entries[0]
        self._result.add_entry(entry, data)
        self.assertEqual(self._result[entry], data)
        self.assertRaises(KeyError, self._result.__getitem__, entry * 2)

    def test_pickling(self):
        """A pickle round trip keeps the task, found flag and entries."""
        self._add_all_entries()
        result = pickle.loads(pickle.dumps(self._result))
        self.assertEqual(self._result.task.site_id, result.task.site_id)
        self.assertEqual(str(self._result.task.url), str(result.task.url))
        self.assertEqual(self._result.found, result.found)
        self._check_entry_names(result)
def execute(self, task): """Execute the task and return the result. """ url = task.url try: ftp = ftplib.FTP() if url.port: ftp.connect(url.hostname.encode(self._encoding), url.port) else: ftp.connect(url.hostname.encode(self._encoding)) if url.username: ftp.login(url.username.encode(self._encoding), url.password.encode(self._encoding)) else: ftp.login() try: ftp.cwd(url.path.encode(self._encoding)) except ftplib.error_perm: # Failed to change directory. result = CrawlResult(task, False) else: # It seems to be a valid directory. result = CrawlResult(task, True) entries = [] callback = lambda line: entries.append(self._parse_list(line)) ftp.retrlines('LIST', callback) for entry_name, is_dir in (entry for entry in entries if entry is not None): data = {} if is_dir is not None: data['is_dir'] = is_dir else: # The parser does not known if this entry is a # directory or not. Try to change directory, if error, # assume it is a file. try: entry_url = url.join(entry_name) ftp.cwd(entry_url.path.encode(self._encoding)) except ftplib.error_perm: data['is_dir'] = False else: data['is_dir'] = True if not data['is_dir']: content = self._get_content( str(task.url.join(entry_name))) if content: data['content'] = content result.add_entry(entry_name, data) ftp.quit() except socket.timeout, error: self._tasks.report_error_site(task) logging.error('Error visiting "%s" (%s)' % (url, error))
def execute(self, task): """Execute the task and return the result. """ url = task.url try: ftp = ftplib.FTP() if url.port: ftp.connect(url.hostname.encode(self._encoding), url.port) else: ftp.connect(url.hostname.encode(self._encoding)) if url.username: ftp.login(url.username.encode(self._encoding), url.password.encode(self._encoding)) else: ftp.login() try: ftp.cwd(url.path.encode(self._encoding)) except ftplib.error_perm: # Failed to change directory. result = CrawlResult(task, False) else: # It seems to be a valid directory. result = CrawlResult(task, True) entries = [] callback = lambda line: entries.append(self._parse_list(line)) ftp.retrlines('LIST', callback) for entry_name, is_dir in (entry for entry in entries if entry is not None): data = {} if is_dir is not None: data['is_dir'] = is_dir else: # The parser does not known if this entry is a # directory or not. Try to change directory, if error, # assume it is a file. try: entry_url = url.join(entry_name) ftp.cwd(entry_url.path.encode(self._encoding)) except ftplib.error_perm: data['is_dir'] = False else: data['is_dir'] = True if not data['is_dir']: content = self._get_content(str(task.url.join(entry_name))) if content: data['content'] = content result.add_entry(entry_name, data) ftp.quit() except socket.timeout, error: self._tasks.report_error_site(task) logging.error('Error visiting "%s" (%s)' % (url, error))
def execute(self, task): """Execute the task and return the result. """ url = task.url try: if os.path.isdir(url.path): result = CrawlResult(task, True) for entry_name in os.listdir(url.path): data = {} entry_url = url.join(entry_name) data['is_dir'] = os.path.isdir(entry_url.path) result.add_entry(entry_url.basename, data) else: result = CrawlResult(task, False) except OSError, error: if error.errno in self._errnos_dir: self._tasks.report_error_dir(task) else: self._tasks.report_error_site(task) logging.error('Error visiting "%s" (%s)' % (url, error.strerror))