Exemplo n.º 1
0
 def execute(self, task):
     """Execute the task and return the result.

     Fetch the HTML directory listing for ``task.url``, parse the
     entries with the class regexes and enqueue a `CrawlResult`.  A
     404 response produces a not-found result (the directory tree
     should be dropped from the index); any other HTTP error is
     reported as a directory error.
     """
     url = task.url
     encoded_url = str(url)
     path_encoded = url.path.encode(self._encoding)
     # Re-quote the path portion of the URL so reserved characters are
     # escaped in the request while the scheme/host part is kept as-is.
     encoded_url = '%s%s/' % (encoded_url[:-len(path_encoded)],
                              urllib.quote(path_encoded.rstrip('/')))
     opener = urllib2.build_opener()
     opener.addheaders = [('User-agent', 'Arachne/%s' % __version__)]
     try:
         handler = opener.open(encoded_url)
         try:
             data = handler.read()
         finally:
             # Always release the connection; the previous version
             # leaked the handler if read() or the parsing below raised.
             handler.close()
         # Everything seems to be OK, add entries to the result.
         result = CrawlResult(task, True)
         for match in self._ENTRIES_RE.finditer(data):
             entry_data = {}
             entry_data['is_dir'] = (match.group(1).lower() == 'dir')
             entry_name = self._ENTITIES_RE.sub(self._sub_entity,
                                                match.group(2))
             result.add_entry(entry_name, entry_data)
         self._results.put(result)
         self._tasks.report_done(task)
     except urllib2.HTTPError, error:
         if error.code == 404:
             # The directory does not exist.  Generate a not-found
             # result because the entire directory tree should be
             # removed from the index.
             result = CrawlResult(task, False)
             self._results.put(result)
             self._tasks.report_done(task)
         else:
             self._tasks.report_error_dir(task)
             logging.error('Error visiting "%s" (%s: %s)' %
                           (url, error.code, error.msg))
Exemplo n.º 2
0
 def setUp(self):
     """Build a crawl task for a known site and a matching result.

     The fixture also prepares sample entries: the first half are
     flagged as directories, the second half as files.
     """
     self._num_entries = 10
     self._found = True
     site_id = 'aa958756e769188be9f76fbdb291fe1b2ddd4777'
     url = URL('ftp://deltha.uh.cu/')
     self._task = CrawlTask(site_id, url)
     half = self._num_entries / 2
     self._entries = [(str(i), {'is_dir': i < half})
                      for i in range(self._num_entries)]
     self._result = CrawlResult(self._task, self._found)
Exemplo n.º 3
0
class TestCrawlResult(unittest.TestCase):
    """Unit tests for the `CrawlResult` container."""

    def setUp(self):
        """Create a task and an empty found-result with sample entries."""
        self._num_entries = 10
        self._found = True
        self._task = CrawlTask('aa958756e769188be9f76fbdb291fe1b2ddd4777',
                               URL('ftp://deltha.uh.cu/'))
        half = self._num_entries / 2
        self._entries = [(str(i), {'is_dir': i < half})
                         for i in range(self._num_entries)]
        self._result = CrawlResult(self._task, self._found)

    def test_properties(self):
        """The result exposes the task it belongs to and the found flag."""
        self.assertEqual(self._result.task.site_id, self._task.site_id)
        self.assertEqual(str(self._result.task.url), str(self._task.url))
        self.assertEqual(self._result.found, self._found)

    def test_add_entry_and_iter(self):
        """Iterating the result yields every added entry exactly once."""
        for name, data in self._entries:
            self._result.add_entry(name, data)
        pending = [name for name, _ in self._entries]
        for name, data in self._result:
            pending.remove(name)
        self.assertEqual(len(pending), 0)

    def test_contains(self):
        """Membership holds for added entries and fails for others."""
        name, data = self._entries[0]
        self._result.add_entry(name, data)
        self.assertTrue(name in self._result)
        self.assertFalse(name * 2 in self._result)

    def test_len(self):
        """len() reports the number of entries added."""
        for name, data in self._entries:
            self._result.add_entry(name, data)
        self.assertEqual(len(self._result), self._num_entries)

    def test_getitem(self):
        """Indexing returns the entry data; missing keys raise KeyError."""
        name, data = self._entries[0]
        self._result.add_entry(name, data)
        self.assertEqual(self._result[name], data)
        self.assertRaises(KeyError, self._result.__getitem__, name * 2)

    def test_pickling(self):
        """A pickle round-trip preserves task, found flag and entries."""
        for name, data in self._entries:
            self._result.add_entry(name, data)
        clone = pickle.loads(pickle.dumps(self._result))
        self.assertEqual(self._result.task.site_id, clone.task.site_id)
        self.assertEqual(str(self._result.task.url), str(clone.task.url))
        self.assertEqual(self._result.found, clone.found)
        pending = [name for name, _ in self._entries]
        for name, data in clone:
            pending.remove(name)
        self.assertEqual(len(pending), 0)
Exemplo n.º 4
0
 def execute(self, task):
     """Execute the task and return the result.

     Opens an FTP session to the host of ``task.url``, changes to the
     task's directory and builds a `CrawlResult` from the LIST output.
     A failed cwd produces a not-found result; a socket timeout is
     reported as a site-level error.
     """
     url = task.url
     try:
         ftp = ftplib.FTP()
         # Pass the port explicitly only when the URL carries one.
         if url.port:
             ftp.connect(url.hostname.encode(self._encoding), url.port)
         else:
             ftp.connect(url.hostname.encode(self._encoding))
         # Authenticated login when credentials are present in the URL,
         # anonymous login otherwise.
         if url.username:
             ftp.login(url.username.encode(self._encoding),
                       url.password.encode(self._encoding))
         else:
             ftp.login()
         try:
             ftp.cwd(url.path.encode(self._encoding))
         except ftplib.error_perm:
             # Failed to change directory.
             result = CrawlResult(task, False)
         else:
             # It seems to be a valid directory.
             result = CrawlResult(task, True)
             entries = []
             callback = lambda line: entries.append(self._parse_list(line))
             ftp.retrlines('LIST', callback)
             # _parse_list() may yield None for unparseable lines; the
             # generator filters those out.
             for entry_name, is_dir in (entry for entry in entries
                                        if entry is not None):
                 data = {}
                 if is_dir is not None:
                     data['is_dir'] = is_dir
                 else:
                     # The parser does not know if this entry is a
                     # directory or not.  Try to change directory, if error,
                     # assume it is a file.
                     try:
                         entry_url = url.join(entry_name)
                         ftp.cwd(entry_url.path.encode(self._encoding))
                     except ftplib.error_perm:
                         data['is_dir'] = False
                     else:
                         data['is_dir'] = True
                 if not data['is_dir']:
                     # Files may contribute indexable content.
                     content = self._get_content(str(task.url.join(entry_name)))
                     if content:
                         data['content'] = content
                 result.add_entry(entry_name, data)
         ftp.quit()
         # NOTE(review): `result` is built but never enqueued or the task
         # reported done in this excerpt -- presumably handled right after;
         # confirm against the full method.
     except socket.timeout, error:
         self._tasks.report_error_site(task)
         logging.error('Error visiting "%s" (%s)' % (url, error))
Exemplo n.º 5
0
 def setUp(self):
     """Create a `ResultQueue` in a fresh database home.

     Prepares ten known sites and ten crawl results per site that the
     tests can put into the queue.
     """
     self._db_home = os.path.join(TESTDIR, 'testresultqueue')
     os.mkdir(self._db_home)
     known_sites = (
         ('a78e6853355ad5cdc751ad678d15339382f9ed21',
          'ftp://atlantis.uh.cu/'),
         ('7e019d6f671d336a0cc31f137ba034efb13fc327',
          'ftp://andromeda.uh.cu/'),
         ('aa958756e769188be9f76fbdb291fe1b2ddd4777',
          'ftp://deltha.uh.cu/'),
         ('d4af25db08f5fb6e768db027d51b207cd1a7f5d0',
          'ftp://anduin.uh.cu/'),
         ('886b46f54bcd45d4dd5732e290c60e9639b0d101',
          'ftp://tigris.uh.cu/'),
         ('ee5b017839d97507bf059ec91f1e5644a30b2fa6',
          'ftp://lara.uh.cu/'),
         ('341938200f949daa356e0b62f747580247609f5a',
          'ftp://nimbo.uh.cu/'),
         ('d64f2fc98d015a43da3be34668341e3ee6f79133',
          'ftp://liverpool.reduh.uh.cu/'),
         ('0d3465f2b9fd5cf55748797c590ea621e3017a29',
          'ftp://london.reduh.uh.cu/'),
         ('c5bcce5953866b673054f8927648d634a7237a9b',
          'ftp://bristol.reduh.uh.cu/'),
     )
     self._sites_info = {}
     for site_id, site_url in known_sites:
         self._sites_info[site_id] = {'url': URL(site_url)}
     self._results = []
     self._results_per_site = 10
     for site_id, info in self._sites_info.iteritems():
         for n in xrange(self._results_per_site):
             task = CrawlTask(site_id, info['url'].join(str(n)))
             self._results.append(CrawlResult(task, True))
     self._queue = ResultQueue(self._sites_info, self._db_home)
Exemplo n.º 6
0
 def execute(self, task):
     """Execute the task and return the result.

     Lists the local directory named by ``task.url`` and builds a
     `CrawlResult` with one entry per child, flagging directories.
     OS errors are reported either as directory or site errors
     depending on the errno.
     """
     url = task.url
     try:
         if not os.path.isdir(url.path):
             # The path is gone (or never was a directory).
             result = CrawlResult(task, False)
         else:
             result = CrawlResult(task, True)
             for name in os.listdir(url.path):
                 child = url.join(name)
                 result.add_entry(child.basename,
                                  {'is_dir': os.path.isdir(child.path)})
     except OSError, error:
         # Errnos in _errnos_dir indicate a problem with this directory
         # only; anything else is treated as a site-wide failure.
         if error.errno in self._errnos_dir:
             self._tasks.report_error_dir(task)
         else:
             self._tasks.report_error_site(task)
         logging.error('Error visiting "%s" (%s)' % (url, error.strerror))
Exemplo n.º 7
0
 def execute(self, task):
     """Execute the task and return the result.

     Opens an FTP session to the host of ``task.url``, changes to the
     task's directory and builds a `CrawlResult` from the LIST output.
     A failed cwd produces a not-found result; a socket timeout is
     reported as a site-level error.
     """
     url = task.url
     try:
         ftp = ftplib.FTP()
         # Pass the port explicitly only when the URL carries one.
         if url.port:
             ftp.connect(url.hostname.encode(self._encoding), url.port)
         else:
             ftp.connect(url.hostname.encode(self._encoding))
         # Authenticated login when credentials are present in the URL,
         # anonymous login otherwise.
         if url.username:
             ftp.login(url.username.encode(self._encoding),
                       url.password.encode(self._encoding))
         else:
             ftp.login()
         try:
             ftp.cwd(url.path.encode(self._encoding))
         except ftplib.error_perm:
             # Failed to change directory.
             result = CrawlResult(task, False)
         else:
             # It seems to be a valid directory.
             result = CrawlResult(task, True)
             entries = []
             callback = lambda line: entries.append(self._parse_list(line))
             ftp.retrlines('LIST', callback)
             # _parse_list() may yield None for unparseable lines; the
             # generator filters those out.
             for entry_name, is_dir in (entry for entry in entries
                                        if entry is not None):
                 data = {}
                 if is_dir is not None:
                     data['is_dir'] = is_dir
                 else:
                     # The parser does not know if this entry is a
                     # directory or not.  Try to change directory, if error,
                     # assume it is a file.
                     try:
                         entry_url = url.join(entry_name)
                         ftp.cwd(entry_url.path.encode(self._encoding))
                     except ftplib.error_perm:
                         data['is_dir'] = False
                     else:
                         data['is_dir'] = True
                 if not data['is_dir']:
                     # Files may contribute indexable content.
                     content = self._get_content(
                         str(task.url.join(entry_name)))
                     if content:
                         data['content'] = content
                 result.add_entry(entry_name, data)
         ftp.quit()
         # NOTE(review): `result` is built but never enqueued or the task
         # reported done in this excerpt -- presumably handled right after;
         # confirm against the full method.
     except socket.timeout, error:
         self._tasks.report_error_site(task)
         logging.error('Error visiting "%s" (%s)' % (url, error))
Exemplo n.º 8
0
 def execute(self, task):
     """Execute the task and return the result.

     Lists the local directory named by ``task.url`` and builds a
     `CrawlResult` with one entry per child, flagging directories.
     OS errors are reported either as directory or site errors
     depending on the errno.
     """
     url = task.url
     try:
         if not os.path.isdir(url.path):
             # The path is gone (or never was a directory).
             result = CrawlResult(task, False)
         else:
             result = CrawlResult(task, True)
             for name in os.listdir(url.path):
                 child = url.join(name)
                 result.add_entry(child.basename,
                                  {'is_dir': os.path.isdir(child.path)})
     except OSError, error:
         # Errnos in _errnos_dir indicate a problem with this directory
         # only; anything else is treated as a site-wide failure.
         if error.errno in self._errnos_dir:
             self._tasks.report_error_dir(task)
         else:
             self._tasks.report_error_site(task)
         logging.error('Error visiting "%s" (%s)' % (url, error.strerror))
Exemplo n.º 9
0
 def execute(self, task):
     """Execute the task and return the result.

     Fetch the HTML directory listing for ``task.url``, parse the
     entries with the class regexes and enqueue a `CrawlResult`.  A
     404 response produces a not-found result (the directory tree
     should be dropped from the index); any other HTTP error is
     reported as a directory error.
     """
     url = task.url
     encoded_url = str(url)
     path_encoded = url.path.encode(self._encoding)
     # Re-quote the path portion of the URL so reserved characters are
     # escaped in the request while the scheme/host part is kept as-is.
     encoded_url = '%s%s/' % (encoded_url[:-len(path_encoded)],
                              urllib.quote(path_encoded.rstrip('/')))
     opener = urllib2.build_opener()
     opener.addheaders = [('User-agent', 'Arachne/%s' % __version__)]
     try:
         handler = opener.open(encoded_url)
         try:
             data = handler.read()
         finally:
             # Always release the connection; the previous version
             # leaked the handler if read() or the parsing below raised.
             handler.close()
         # Everything seems to be OK, add entries to the result.
         result = CrawlResult(task, True)
         for match in self._ENTRIES_RE.finditer(data):
             entry_data = {}
             entry_data['is_dir'] = (match.group(1).lower() == 'dir')
             entry_name = self._ENTITIES_RE.sub(self._sub_entity,
                                                match.group(2))
             result.add_entry(entry_name, entry_data)
         self._results.put(result)
         self._tasks.report_done(task)
     except urllib2.HTTPError, error:
         if error.code == 404:
             # The directory does not exist.  Generate a not-found
             # result because the entire directory tree should be
             # removed from the index.
             result = CrawlResult(task, False)
             self._results.put(result)
             self._tasks.report_done(task)
         else:
             self._tasks.report_error_dir(task)
             logging.error('Error visiting "%s" (%s: %s)' %
                           (url, error.code, error.msg))