Exemplo n.º 1
0
 def _compile_regex(self, yamlconfig, re_module):
     """Build one PastieSearch per entry of the ``search`` section.

     :param yamlconfig: parsed configuration mapping; ``search`` must be
         present, ``strict_regex`` is optional and defaults to False.
     :param re_module: regular expression module handed to PastieSearch.
     :return: list of the PastieSearch objects that could be built.
     :raises PystemonConfigException: when ``strict_regex`` is truthy and
         an entry has no ``search`` key or cannot be turned into a
         PastieSearch.
     :raises KeyError: when ``yamlconfig`` has no ``search`` section.
     """
     strict = yamlconfig.get('strict_regex', False)
     regexes = yamlconfig['search']
     logger.debug("compiling {} regexes ...".format(len(regexes)))
     patterns = []
     for regex in regexes:
         # Resolve the pattern text on its own, so the "unable to parse"
         # handler below always has a valid ``search`` for this very entry
         # (never an unbound name or the value of the previous entry).
         try:
             search = regex['search']
         except (KeyError, TypeError):
             # TypeError covers entries that are not mappings at all,
             # e.g. an empty list item parsed as None.
             if strict:
                 raise PystemonConfigException("Missing search pattern")
             logger.error("Error: skipping empty search pattern entry")
             continue
         try:
             patterns.append(PastieSearch(re_module, regex))
         except Exception as e:
             # Any failure while building the search object is reported
             # against the pattern text that caused it.
             if strict:
                 raise PystemonConfigException(
                     "Unable to parse regex '%s': %s" % (search, e))
             logger.error("Error: Unable to parse regex '%s': %s" %
                          (search, e))
     logger.debug("successfully compiled {0}/{1} regexes".format(
         len(patterns), len(regexes)))
     return patterns
Exemplo n.º 2
0
 def _load_regex_engine(self, yamlconfig):
     """Import and return the regular expression module to use.

     The module name is read from the ``engine`` key and defaults to
     ``'re'``; only ``'re'`` and ``'regex'`` are accepted. For ``'regex'``
     the module's DEFAULT_VERSION is set to its VERSION1.

     :param yamlconfig: parsed configuration mapping.
     :return: the imported module.
     :raises PystemonConfigException: for an unsupported engine name or
         when the module cannot be imported.
     """
     engine = yamlconfig.get('engine', 're')
     if engine not in ('re', 'regex'):
         raise PystemonConfigException("only 're' or 'regex' supported, not '{0}'".format(engine))
     try:
         logger.debug("Loading regular expression engine '{0}'".format(engine))
         module = importlib.import_module(engine)
         if engine == 'regex':
             logger.debug("Setting regex DEFAULT_VERSION to VERSION1")
             module.DEFAULT_VERSION = module.VERSION1
         return module
     except ImportError:
         raise PystemonConfigException("unable to import module '{0}'".format(engine))
Exemplo n.º 3
0
    def _load_storage_engines(self, yamlconfig):
        """Instantiate the storage backends declared under ``storage``.

        The ``archive`` backend is mandatory and is loaded first, because
        its ``save_dir`` and ``archive_dir`` are passed on to every other
        backend.

        :param yamlconfig: parsed configuration mapping; it is not modified.
        :return: dict with the keys ``save_dir``, ``archive_dir``,
            ``compress`` and ``engines`` (list of loaded backends, the
            archive backend first).
        :raises PystemonConfigException: when ``storage`` has no
            ``archive`` entry.
        """
        # Work on a shallow copy: taking 'archive' out below must not alter
        # the caller's configuration. ``or {}`` also copes with a
        # ``storage:`` key that is present but empty (parsed as None).
        storage_config = dict(yamlconfig.get('storage') or {})
        # Only the lookup is guarded, so a KeyError raised while a backend
        # is being built is not mistaken for a missing 'archive' section.
        try:
            archive_config = storage_config.pop('archive')
        except KeyError:
            raise PystemonConfigException(
                'archive was not found under storage, old pystemon.yaml config?'
            )

        storage_engines = []
        save_dir = None
        archive_dir = None
        compress = False
        storage_file = PastieStorage.load_storage('archive', **archive_config)
        if storage_file is not None:
            save_dir = storage_file.save_dir
            archive_dir = storage_file.archive_dir
            compress = storage_file.compress
            storage_engines.append(storage_file)

        # Remaining backends inherit the directories of the archive backend.
        for name, options in storage_config.items():
            engine = PastieStorage.load_storage(name,
                                                save_dir=save_dir,
                                                archive_dir=archive_dir,
                                                **options)
            if engine is not None:
                storage_engines.append(engine)
        return {
            'save_dir': save_dir,
            'archive_dir': archive_dir,
            'compress': compress,
            'engines': storage_engines
        }
Exemplo n.º 4
0
 def reload(self):
     """(Re)load the configuration and publish it on this instance.

     While holding ``self.lock`` the method bumps the reload counter, calls
     ``_preload()`` and ``_reload()``, copies the resulting settings onto
     the matching ``_<name>`` attributes and recomputes
     ``_max_throttling`` as the largest ``throttling`` among the sites
     (never below 0). On any reload after the first one the cached
     ``_yamlconfig`` is cleared beforehand.

     :return: True on success.
     :raises PystemonConfigException: on any failure; other exceptions
         are wrapped into one.
     """
     # Settings copied verbatim from the parsed config, in this order.
     copied_settings = (
         'ip_addr', 'sendmail', 'save_thread', 'user_agents_list',
         'storage_engines', 'save_dir', 'archive_dir', 'compress',
         'proxies_list', 're_module', 'patterns', 'sites', 'threads',
         'pidfile',
     )
     try:
         with self.lock:
             if not self._reload_count:
                 logger.debug("loading configuration file '{0}'".format(self._configfile))
             else:
                 logger.debug("reloading configuration file '{0}'".format(self._configfile))
                 self._yamlconfig = None
             self._reload_count += 1
             self._preload()
             config = self._reload()
             for setting in copied_settings:
                 setattr(self, '_' + setting, config.get(setting))
             self._max_throttling = 0
             for site in self._sites:
                 self._max_throttling = max(self._max_throttling,
                                            site.throttling)
     except PystemonConfigException:
         raise
     except Exception as e:
         raise PystemonConfigException('Unable to parse configuration: {}'.format(e))
     logger.debug("configuration loaded")
     return True
Exemplo n.º 5
0
    def _reload(self):
        """Turn the loaded YAML document into a dict of runtime settings.

        Reads ``self._yamlconfig`` and returns a dict holding
        ``save_thread``, ``sendmail``, ``storage_engines``, ``save_dir``,
        ``archive_dir``, ``compress``, ``re_module``, ``patterns``,
        ``threads`` and ``sites``, plus the optional entries
        ``proxies_list`` (random proxy enabled), ``user_agents_list``
        (random user-agent enabled) and ``ip_addr`` (``network.ip`` set).

        Unless ``self.debug`` is set, a valid ``logging-level`` entry is
        also applied to the module logger.

        :raises PystemonConfigException: when a random user-agent is
            requested without a file.
        """
        logger.debug("parsing yaml configuration from file '{}'".format(
            self._configfile))
        config = {}
        yamlconfig = self._yamlconfig

        # A missing 'proxy' section, 'random' flag or 'file' entry simply
        # leaves the proxy list unset.
        try:
            if yamlconfig['proxy']['random']:
                config['proxies_list'] = ProxyList(yamlconfig['proxy']['file'])
        except KeyError:
            pass

        config['save_thread'] = yamlconfig.get('save-thread', False)

        uaconfig = yamlconfig.get('user-agent', {})
        if uaconfig.get('random', False):
            # Guard only the lookup, so errors raised while reading the
            # file are not reported as "no file provided".
            try:
                ua_file = uaconfig['file']
            except KeyError:
                raise PystemonConfigException(
                    'random user-agent requested but no file provided')
            config['user_agents_list'] = self._load_user_agents_from_file(
                ua_file)

        # Store the address in the result: reload() reads it back through
        # config.get('ip_addr').
        try:
            config['ip_addr'] = yamlconfig['network']['ip']
        except KeyError:
            logger.debug("Using default IP address")

        config['sendmail'] = self._load_email(yamlconfig)
        res = self._load_storage_engines(yamlconfig)
        config['storage_engines'] = res['engines']
        config['save_dir'] = res['save_dir']
        config['archive_dir'] = res['archive_dir']
        config['compress'] = res['compress']
        config['re_module'] = self._load_regex_engine(yamlconfig)
        config['patterns'] = self._compile_regex(yamlconfig,
                                                 config['re_module'])

        # An unusable thread count is logged and replaced by 1.
        try:
            threads = int(yamlconfig.get('threads', 1))
            if threads < 1:
                raise ValueError("minimum acceptable value is 1")
        except (TypeError, ValueError, OverflowError) as e:
            logger.error("invalid threads value specified: {0}".format(e))
            threads = 1
        config['threads'] = threads

        config['sites'] = self._load_sites(yamlconfig)

        if not self.debug and 'logging-level' in yamlconfig:
            level_name = yamlconfig['logging-level']
            if level_name in [
                    'NOTSET', 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
            ]:
                logger.setLevel(logging.getLevelName(level_name))
            else:
                logger.error("logging level \"%s\" is invalid" %
                             (level_name))

        logger.debug("yaml configuration parsed")
        return config
Exemplo n.º 6
0
 def _load_user_agents_from_file(self, filename):
     """Read user-agent strings from ``filename``, one per line.

     Lines are stripped of surrounding whitespace and blank lines are
     ignored.

     :param filename: path of the text file to read.
     :return: list of the non-empty, stripped lines in file order.
     :raises PystemonConfigException: when the file yields no entry.
     """
     logger.debug('Loading user-agent from file "{file}" ...'.format(file=filename))
     with open(filename) as handle:
         stripped_lines = (raw.strip() for raw in handle)
         agents = [entry for entry in stripped_lines if entry]
     if not agents:
         raise PystemonConfigException("found zero valid UserAgents")
     logger.debug("Found {count} UserAgents in file '{file}'".format(file=filename, count=len(agents)))
     return agents
Exemplo n.º 7
0
 def _load_yamlconfig(self, configfile):
     """Parse ``configfile`` and merge in the files listed in ``includes``.

     Each include is parsed in turn and merged into the main document with
     ``dict.update``, so top-level keys of an include replace those
     already loaded.

     :param configfile: path of the main YAML configuration file.
     :return: the merged configuration mapping.
     :raises PystemonConfigException: when the main file is not valid
         YAML, is empty or is not a mapping, or when an include cannot be
         loaded.
     """
     try:
         # safe_load(stream) is load(stream, Loader=SafeLoader), so no
         # PyYAML version switch is needed; ``with`` closes the file.
         with open(configfile) as stream:
             yamlconfig = yaml.safe_load(stream)
     except yaml.YAMLError as exc:
         logger.error("Error in configuration file {0}:".format(configfile))
         # problem_mark may be absent or None depending on the error.
         mark = getattr(exc, 'problem_mark', None)
         if mark is not None:
             raise PystemonConfigException("error position: (%s:%s)" % (mark.line + 1, mark.column + 1))
         # Never continue with a half-parsed (None) configuration.
         raise PystemonConfigException("unable to parse '{0}': {1}".format(configfile, exc))
     if not isinstance(yamlconfig, dict):
         # An empty file parses to None; scalars and lists are unusable too.
         raise PystemonConfigException(
             "configuration file '{0}' is empty or not a mapping".format(configfile))
     # ``or []`` tolerates an ``includes:`` key without a value.
     for include in yamlconfig.get("includes") or []:
         try:
             logger.debug("loading include '{0}'".format(include))
             with open(include) as stream:
                 yamlconfig.update(yaml.safe_load(stream))
         except Exception as e:
             raise PystemonConfigException("failed to load '{0}': {1}".format(include, e))
     return yamlconfig