Example #1
 def test_join(self):
     for url_str, is_root, attrs, join_info in self._urls:
         url = URL(url_str, is_root)
         joined_url = url.join(join_info[0])
         # Joined URLs should not be root.
         self.assertEquals(joined_url.is_root, False)
         self.assertEquals(str(joined_url), join_info[1])
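A note on the fixture: the join tests in this and the next example iterate over a self._urls attribute of (url_str, is_root, attrs, join_info) tuples that the excerpts never show. A minimal sketch of the setUp these tests appear to assume (all values illustrative, not taken from the original suite):

 def setUp(self):
     self._encoding = 'utf-8'
     # Each entry: (URL string, is_root flag, expected attribute tuple,
     # (name to join, expected joined URL)).  The attribute order matches
     # the property tests later in this listing: scheme, username,
     # password, hostname, port, path, dirname, basename.
     self._urls = [
         ('ftp://deltha.uh.cu/', True,
          ('ftp', None, None, 'deltha.uh.cu', None, '/', '/', ''),
          ('debian', 'ftp://deltha.uh.cu/debian')),
     ]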
Example #2
 def test_join_unicode_args(self):
     for url_str, is_root, attrs, join_info in self._urls:
         url = URL(url_str.decode(self._encoding), is_root)
         joined_url = url.join(join_info[0].decode(self._encoding))
         # Joined URLs should not be root.
         self.assertEquals(joined_url.is_root, False)
         self.assertEquals(str(joined_url), join_info[1])
Example #3
 def setUp(self):
     self._db_home = os.path.join(TESTDIR, 'testresultqueue')
     os.mkdir(self._db_home)
     self._sites_info = {
         'a78e6853355ad5cdc751ad678d15339382f9ed21':
             {'url': URL('ftp://atlantis.uh.cu/')},
         '7e019d6f671d336a0cc31f137ba034efb13fc327':
             {'url': URL('ftp://andromeda.uh.cu/')},
         'aa958756e769188be9f76fbdb291fe1b2ddd4777':
             {'url': URL('ftp://deltha.uh.cu/')},
         'd4af25db08f5fb6e768db027d51b207cd1a7f5d0':
             {'url': URL('ftp://anduin.uh.cu/')},
         '886b46f54bcd45d4dd5732e290c60e9639b0d101':
             {'url': URL('ftp://tigris.uh.cu/')},
         'ee5b017839d97507bf059ec91f1e5644a30b2fa6':
             {'url': URL('ftp://lara.uh.cu/')},
         '341938200f949daa356e0b62f747580247609f5a':
             {'url': URL('ftp://nimbo.uh.cu/')},
         'd64f2fc98d015a43da3be34668341e3ee6f79133':
             {'url': URL('ftp://liverpool.reduh.uh.cu/')},
         '0d3465f2b9fd5cf55748797c590ea621e3017a29':
             {'url': URL('ftp://london.reduh.uh.cu/')},
         'c5bcce5953866b673054f8927648d634a7237a9b':
             {'url': URL('ftp://bristol.reduh.uh.cu/')},
     }
     self._results = []
     self._results_per_site = 10
     for site_id, info in self._sites_info.iteritems():
         # Create _results_per_site successful crawl results for each site.
         for name in (str(n) for n in xrange(self._results_per_site)):
             task = CrawlTask(site_id, info['url'].join(name))
             self._results.append(CrawlResult(task, True))
     self._queue = ResultQueue(self._sites_info, self._db_home)
Example #4
 def setUp(self):
     self._db_home = os.path.join(TESTDIR, 'testtaskqueue')
     os.mkdir(self._db_home)
     self._request_wait = 2
     self._error_dir_wait = 3
     self._error_site_wait = 4
     self._min_revisit_wait = 2
     self._default_revisit_wait = 4
     self._sites_info = {
         'a78e6853355ad5cdc751ad678d15339382f9ed21': {
             'url': URL('ftp://atlantis.uh.cu/')
         },
         '7e019d6f671d336a0cc31f137ba034efb13fc327': {
             'url': URL('ftp://andromeda.uh.cu/')
         },
         'aa958756e769188be9f76fbdb291fe1b2ddd4777': {
             'url': URL('ftp://deltha.uh.cu/')
         },
         'd4af25db08f5fb6e768db027d51b207cd1a7f5d0': {
             'url': URL('ftp://anduin.uh.cu/')
         },
         '886b46f54bcd45d4dd5732e290c60e9639b0d101': {
             'url': URL('ftp://tigris.uh.cu/')
         },
         'ee5b017839d97507bf059ec91f1e5644a30b2fa6': {
             'url': URL('ftp://lara.uh.cu/')
         },
         '341938200f949daa356e0b62f747580247609f5a': {
             'url': URL('ftp://nimbo.uh.cu/')
         },
         'd64f2fc98d015a43da3be34668341e3ee6f79133': {
             'url': URL('ftp://liverpool.reduh.uh.cu/')
         },
         '0d3465f2b9fd5cf55748797c590ea621e3017a29': {
             'url': URL('ftp://london.reduh.uh.cu/')
         },
         'c5bcce5953866b673054f8927648d634a7237a9b': {
             'url': URL('ftp://bristol.reduh.uh.cu/')
         },
     }
     self._tasks = {}
     self._tasks_per_site = 10
     self._num_sites = len(self._sites_info)
     self._num_tasks = self._num_sites * self._tasks_per_site
     for site_id, info in self._sites_info.iteritems():
         # Set common information.
         info['max_depth'] = 100
         info['request_wait'] = self._request_wait
         info['error_dir_wait'] = self._error_dir_wait
         info['error_site_wait'] = self._error_site_wait
         info['min_revisit_wait'] = self._min_revisit_wait
         info['default_revisit_wait'] = self._default_revisit_wait
         # Create tasks for site.
         task_list = []
         for name in (str(n) for n in xrange(self._tasks_per_site)):
             task_list.append(CrawlTask(site_id, info['url'].join(name)))
         self._tasks[site_id] = task_list
     self._queue = TaskQueue(self._sites_info, self._db_home)
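The excerpt ends at setUp; the matching tearDown is not shown. A minimal sketch of the cleanup it presumably performs, assuming the queue exposes a close() method and that the test module imports shutil (both assumptions, not confirmed by the excerpt):

 def tearDown(self):
     # Assumed cleanup: release the queue and remove the database
     # home directory that setUp created under TESTDIR.
     self._queue.close()
     shutil.rmtree(self._db_home)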
Example #5
    def __init__(self, sites, num_crawlers, spool_dir, database_dir, log_file,
                 log_level, pid_file):
        """Initialize the daemon.

        Creates the `TaskQueue`, `ResultQueue`, `CrawlerManager` and
        `ProcessorManager` instances.  The `sites` argument should be a list
        with the information for each site.
        """
        Daemon.__init__(self, pid_file=pid_file)
        logging.basicConfig(filename=log_file,
                            level=log_level,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')
        logging.info('Starting Arachne daemon %s', __version__)
        logging.info('Running for %d sites', len(sites))
        # Create URL instances and assign an id to each site.
        self._sites_info = {}
        for site in sites:
            site['url'] = URL(site['url'], True)
            self._sites_info[hashlib.sha1(str(site['url'])).hexdigest()] = site
        # Create or check required directories.
        self._results_dir = os.path.join(spool_dir, 'results')
        if not os.path.isdir(self._results_dir):
            os.mkdir(self._results_dir)
        self._tasks_dir = os.path.join(spool_dir, 'tasks')
        if not os.path.isdir(self._tasks_dir):
            os.mkdir(self._tasks_dir)
        self._database_dir = database_dir
        self._num_crawlers = num_crawlers
        self._running = False
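This constructor also shows where the hexadecimal site ids used as dictionary keys in the setUp fixtures come from: each id is the SHA-1 hex digest of the site URL string. A standalone illustration of the derivation (Python 2 str arguments, matching the code above):

 import hashlib

 # The fixtures pair ftp://deltha.uh.cu/ with the id
 # 'aa958756e769188be9f76fbdb291fe1b2ddd4777', which is consistent
 # with deriving ids this way.
 site_id = hashlib.sha1('ftp://deltha.uh.cu/').hexdigest()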
Example #6
 def setUp(self):
     url = URL('ftp://deltha.uh.cu/')
     site_id = 'aa958756e769188be9f76fbdb291fe1b2ddd4777'
     self._num_entries = 10
     self._found = True
     self._task = CrawlTask(site_id, url)
     # The first half of the entries are directories, the rest files.
     self._entries = [(str(i), {'is_dir': i < (self._num_entries / 2)})
                      for i in range(self._num_entries)]
     self._result = CrawlResult(self._task, self._found)
Example #7
 def test_properties_unicode_args(self):
     for url_str, is_root, attrs, join_info in self._urls:
         url = URL(url_str.decode(self._encoding), is_root)
         self.assertEquals(url.is_root, is_root)
         self.assertEquals(url.scheme, attrs[0])
         self.assertEquals(url.username, attrs[1])
         self.assertEquals(url.password, attrs[2])
         self.assertEquals(url.hostname, attrs[3])
         self.assertEquals(url.port, attrs[4])
         self.assertEquals(url.path, attrs[5])
         self.assertEquals(url.dirname, attrs[6])
         self.assertEquals(url.basename, attrs[7])
Example #8
 def test_pickling(self):
     for url_str, is_root, attrs, join_info in self._urls:
         url = pickle.loads(pickle.dumps(URL(url_str, is_root)))
         self.assertEquals(url.is_root, is_root)
         self.assertEquals(url.scheme, attrs[0])
         self.assertEquals(url.username, attrs[1])
         self.assertEquals(url.password, attrs[2])
         self.assertEquals(url.hostname, attrs[3])
         self.assertEquals(url.port, attrs[4])
         self.assertEquals(url.path, attrs[5])
         self.assertEquals(url.dirname, attrs[6])
         self.assertEquals(url.basename, attrs[7])
Example #9
 def test_type_unicode(self):
     for url_str, is_root, attrs, join_info in self._urls:
         url = URL(url_str, is_root)
         self.assertTrue(type(url.scheme) is unicode)
         if attrs[1] is not None:
             self.assertTrue(type(url.username) is unicode)
         if attrs[2] is not None:
             self.assertTrue(type(url.password) is unicode)
         if attrs[3] is not None:
             self.assertTrue(type(url.hostname) is unicode)
         self.assertTrue(type(url.path) is unicode)
         self.assertTrue(type(url.dirname) is unicode)
         self.assertTrue(type(url.basename) is unicode)
         pickled_url = pickle.loads(pickle.dumps(url))
         self.assertTrue(type(pickled_url.scheme) is unicode)
         if attrs[1] is not None:
             self.assertTrue(type(pickled_url.username) is unicode)
         if attrs[2] is not None:
             self.assertTrue(type(pickled_url.password) is unicode)
         if attrs[3] is not None:
             self.assertTrue(type(pickled_url.hostname) is unicode)
         self.assertTrue(type(pickled_url.path) is unicode)
         self.assertTrue(type(pickled_url.dirname) is unicode)
         self.assertTrue(type(pickled_url.basename) is unicode)
Example #10
 def setUp(self):
     self._url = URL('ftp://deltha.uh.cu/')
     self._site_id = 'aa958756e769188be9f76fbdb291fe1b2ddd4777'
     self._task = CrawlTask(self._site_id, self._url)