Exemplo n.º 1
0
    def start(self, delay_in_sec=0):
        """
        Starts the intervalmanager, which starts
        the crawlmanager procedure with a given delay

        The crawl interval is re-read from the config on every call.
        If the status is 'stop' once the delay is over, no crawl is
        started and the status is set to 'ready' instead.

        :delay_in_sec: delay time in seconds, values <= 0 mean no delay
        """
        # fetch interval from config, stored in seconds (config value * 60)
        self.__interval = float(config.get('crawler.interval')) * 60

        # delay before next crawl; a negative delay would make
        # time.sleep() raise a ValueError, so it is treated as "no delay"
        if delay_in_sec > 0:
            # single-line message: a triple-quoted literal would put a
            # newline and the source indentation into every log record
            logging.info('Next crawl will start in %s seconds.', delay_in_sec)
            time.sleep(delay_in_sec)

        if self.status != 'stop':
            # a fresh crawler manager is created for every crawl run
            self.__cmanager = c.CrawlerManager(utl.unique_items_from_file())
            self.__start_time = times.get_localtime_sec()
            self.__cmanager.register_done(self.crawling_done_callback)
            self.__set_status('active')
            self.__cmanager.start()
        else:
            # a stop was requested, do not start another crawl
            self.__set_status('ready')
            logging.info('Interval Manager finished - current state set to ready.')
Exemplo n.º 2
0
 def __init__(self, urls):
     """
     Set up the manager for a collection of urls

     All attributes are always defined; the thread pool is only
     created if there is at least one url, otherwise it is None.

     :urls: set of urls that will be crawled
     """
     self.__die = False
     # Define every attribute unconditionally, so that an empty url set
     # does not leave a half-initialised object behind that raises
     # AttributeError as soon as one of these attributes is read.
     self.__done_callback = None
     self.__urls = list(urls)
     self.__pool = None
     if self.__urls:
         # worker threads are only spawned if there is something to crawl
         self.__pool = mpool.ThreadPool(config.get('crawler.maxInst'))
Exemplo n.º 3
0
    def __init__(self):
        """
        Collect the command line arguments and dispatch to the submodules

        Steps, in order: parse the arguments with docopt, set up logging,
        load the config (skipped for 'init') and call the handler of
        every submodule that was selected on the command line.

        Exits with -1 on an invalid --loglevel and with -4 if the config
        cannot be read; exits with 0 if a handler raises
        lock.FileLockException.
        """
        self.__filelock = lock.FileLock(LOCKFILE, folder=config.get('general.root'), timeout=0.1)
        self._args = docopt(__doc__, version='Archive 1.0')
        submodules = {
            'init': self.handle_init,
            'crawler': self.handle_crawler,
            'javadapter': self.handle_javadapter,
            'db': self.handle_db,
            'config': self.handle_config,
            'repair': self.handle_repair,
        }

        # A missing or unset --loglevel falls back to INFO
        try:
            loglevel = self._args['--loglevel']
        except KeyError:
            loglevel = None

        if loglevel is None:
            severity = logging.INFO
        else:
            # Only accept names that resolve to a numeric level. Other
            # attributes of the logging module (e.g. BASIC_FORMAT) would
            # be passed on and make basicConfig() fail further down.
            severity = getattr(logging, str(loglevel).upper(), None)
            if not isinstance(severity, int):
                print('Error: \"loglevel\" is not valid severity level')
                print(__doc__)
                sys.exit(-1)

        try:
            logging.basicConfig(level=severity,
                                filename=os.path.join(paths.get_log_dir(), 'archive.log'),
                                format='%(asctime)s - %(levelname)s - %(message)s')
        except IOError as err:
            # Disable warning for initialization
            if self._args['init'] is False:
                print('Cannot open log - file structure probably does not exist yet:', err)

        # Set up config to another file if desired
        try:
            if self._args['init'] is False and self._args['--config']:
                config.load(os.path.abspath(self._args['--config']))
            elif self._args['init'] is False:
                config.load('webarchive.conf.xml')
        except IOError as err:
            print('FATAL: Unable to locate config:', err)
            sys.exit(-4)

        # iterating through arguments
        for module, handler in submodules.items():
            if self._args[module]:
                try:
                    handler()
                except lock.FileLockException:
                    print("archive is currently locked with global.lock.")
                    sys.exit(0)
Exemplo n.º 4
0
    def load(self, plugin_path=None):
        r"""
        Load a list of \*.py files from a directory

        The .py files are read in and stored in-memory: for every file
        a tuple (path, source + newline) is appended to the source list,
        in sorted order of the paths.

        :plugin_path: A path to a directory with .py files
                      or None to read the value from the cfg
        """
        # NOTE: the docstring is a raw string, since "\*" is an invalid
        # escape sequence in a normal string literal.

        # If no path is given,
        # we'll try to read it from the cfg
        if plugin_path is None:
            actual_path = os.path.join(config.get('general.root'), config.get('general.filterpath'))
        else:
            actual_path = plugin_path

        # Build a list of (path_to_filter, filter_source)
        for source in sorted(glob.glob(os.path.join(actual_path, '*.py'))):
            with open(source, 'r') as handle:
                # the trailing newline guarantees that the source ends with one
                self.__source_list.append(
                    (source, handle.read() + '\n'))
Exemplo n.º 5
0
 def handle_config(self):
     """
     Run the config operation selected on the command line

     The options are checked in the order --get, --set, --default,
     --config; only the first one that is set gets executed.
     """
     args = self._args

     if args['--get']:
         # print the current value of the requested key
         print(config.get(args['--get']))
         return

     if args['--set']:
         # store <value> under the given key
         config.set(args['--set'], args['<value>'])
         return

     if args['--default']:
         # print the default value of the requested key
         print(config.get_default(args['--default']))
         return

     if args['--config']:
         # load the given config file
         config.load(args['--config'])
Exemplo n.º 6
0
def start(host='localhost', port=None):
    """
    Start the Javadapter server in a background thread

    The server is served from a daemon thread, so this function
    returns right after the thread was started.

    :host: the host to start the server on (does anything but localhost work?)
    :port: the port on which the server listens on,
           or None to read 'javadapter.port' from the config
    :returns: a server, on which shutdown() can be called
    """
    # Resolve the default at call time: a default argument is evaluated
    # only once, when the function is defined, which would freeze the
    # value the config had at import time.
    if port is None:
        port = config.get('javadapter.port')

    # Spawn a new thread for each connection
    server = ThreadedTCPServer((host, port), AdapterHandler)
    server_thread = threading.Thread(target=server.serve_forever)
    # daemon thread: does not keep the interpreter alive on exit
    server_thread.daemon = True
    server_thread.start()

    return server
Exemplo n.º 7
0
 def __init__(self):
     """
     Remember the path of the pickle cache below the configured root
     """
     root_dir = config.get('general.root')
     self.__pickle_path = os.path.join(root_dir, 'pickle_cache')
Exemplo n.º 8
0
def get_urllist_path():
    """
    Join the configured 'general.root' with 'crawler.urllistpath'

    :returns: return path to url.txt
    """
    root_dir = config.get('general.root')
    urllist = config.get('crawler.urllistpath')
    return os.path.join(root_dir, urllist)
Exemplo n.º 9
0
def get_temp_root():
    """
    Join the configured 'general.root' with 'crawler.tempRoot'

    :returns: temp dir path
    """
    root_dir = config.get('general.root')
    temp_dir = config.get('crawler.tempRoot')
    return os.path.join(root_dir, temp_dir)
Exemplo n.º 10
0
def get_log_dir():
    """
    Join the configured 'general.root' with the fixed 'logs/' directory

    :returns: log dir path
    """
    root_dir = config.get('general.root')
    return os.path.join(root_dir, 'logs/')
Exemplo n.º 11
0
def get_archive_root():
    """
    Read 'general.root' from the config

    :returns: archive root path
    """
    archive_root = config.get('general.root')
    return archive_root
Exemplo n.º 12
0
def get_content_root():
    """
    Join the configured 'general.root' with the fixed 'content/' directory

    :return: '{archive_root}/content/' path
    """
    root_dir = config.get('general.root')
    return os.path.join(root_dir, 'content/')
Exemplo n.º 13
0
def get_sqlpath():
    """
    Join the configured 'general.root' with 'db.sqlSource'

    :returns: sql statements path
    """
    root_dir = config.get('general.root')
    sql_source = config.get('db.sqlSource')
    return os.path.join(root_dir, sql_source)
Exemplo n.º 14
0
def get_dbpath():
    """
    Join the configured 'general.root' with 'db.path'

    :returns: db path
    """
    root_dir = config.get('general.root')
    db_file = config.get('db.path')
    return os.path.join(root_dir, db_file)
Exemplo n.º 15
0

def get_urllist_path():
    """
    Join the configured 'general.root' with 'crawler.urllistpath'

    :returns: return path to url.txt
    """
    path_parts = (
        config.get('general.root'),
        config.get('crawler.urllistpath'),
    )
    return os.path.join(*path_parts)

###########################################################################
#                                unittest                                 #
###########################################################################


if __name__ == '__main__':
    # some predefined values to test with
    ROOT = config.get('general.root')
    SQL_PATH = 'sql/'
    DB_PATH = 'metadata.db'
    CRAWLER_TMP_ROOT = 'tmp/'

    CONTENT_DIR = 'content/'
    LOG_DIR = 'logs/'
    TEMP_DIR = 'tmp/'
    DOMAIN = 'www.domain_name.org'
    URL = 'url.txt'

    class TestPaths(unittest.TestCase):

        def test_get_db_path(self):
            self.assertEqual(get_dbpath(), os.path.join(ROOT, DB_PATH))