def __init__(self, model: dict):
    """Build the model section from an inline dict or an external reference.

    :param model: either a dict holding the inline declaration (``name``,
        ``fields`` and ``mappers`` keys are read) or a string that is handed
        to ``detect_actions`` to load an external definition. A falsy value
        leaves the instance untouched.
    :raises CrawlinoValueError: when the external definition cannot be
        loaded, or when an inline declaration has no ``name``.
    """
    if not model:
        return

    # Load external
    if isinstance(model, str):
        _model = detect_actions(model)

        # Validate the *loaded* definition. The previous code tested `model`,
        # which is always truthy at this point, so a failed load was never
        # reported.
        if not _model:
            raise CrawlinoValueError("Invalid model input values",
                                     exc_info=True,
                                     extra={"input_model": model})

        # NOTE(review): as in the original code, the loaded definition is not
        # used to populate name/fields/mappers - confirm whether it should
        # fall through to the inline parsing below.
        return

    # Inline declaration
    self.name = gt(model, "name", None)

    if not self.name:
        # The check is on 'name', so the message names that property.
        raise CrawlinoValueError("Error in Models: Models must "
                                 "have 'name' property.")

    self.fields = CMModelsFields(gt(model, "fields", None))
    self.mappers = CMModelsMappers(gt(model, "mappers", None))
def __init__(self, type: str, config: Dict or None, name: str = None):
    """Store the definition and check that its type is a known hook module.

    :param type: module name, looked up in the ``"hooks"`` group of
        ``CrawlinoModulesStore``.
    :param config: options for the module; a falsy value is stored as ``{}``.
    :param name: optional label; a falsy value is stored as ``""``.
    :raises CrawlinoValueError: when no ``"hooks"`` module is registered
        under ``type``.
    """
    self.type = type
    self.name = name or ""
    # A missing config is normalised to an empty dict, so it can never be
    # None afterwards. The former `self.config is None` guard was therefore
    # unreachable and has been removed (behaviour is unchanged).
    self.config = config or {}

    if CrawlinoModulesStore.find_module("hooks", self.type) is None:
        raise CrawlinoValueError("Invalid 'type' property value",
                                 exc_info=True,
                                 extra={
                                     "given_source_type": self.type
                                 })
def __init__(self, type: str, config: Dict or None, name: str = None):
    """Store the definition and check that its type is a known module.

    :param type: module name, looked up in the ``STEP_EXTRACTOR`` group of
        ``CrawlinoModulesStore``; must be truthy.
    :param config: options for the module; a falsy value is stored as ``{}``.
    :param name: optional label; a falsy value is stored as ``""``.
    :raises CrawlinoValueError: when ``type`` is empty or no module is
        registered under it.
    """
    self.type = type
    self.name = name or ""
    # A missing config is normalised to an empty dict, so it can never be
    # None afterwards. The former `self.config is None` guard was therefore
    # unreachable and has been removed (behaviour is unchanged).
    self.config = config or {}

    if not self.type:
        raise CrawlinoValueError("Config must has the 'type' property")

    if CrawlinoModulesStore.find_module(STEP_EXTRACTOR, self.type) is None:
        raise CrawlinoValueError(
            f"Invalid 'type' property value: "
            f"'{self.type}'",
            exc_info=True,
            extra={"input_type": self.type})
def __init__(self, fields: List[dict]):
    """Build one mapper object per entry of ``fields``.

    Each entry must be a dict with exactly one key. The key selects the
    class in ``self.MAPPERS`` and its value is expanded as keyword arguments
    for that class. Built objects are stored in ``self.mappers`` keyed by
    their ``name`` attribute.

    :param fields: list of single-key dicts; ``None`` or an empty list
        yields no mappers.
    :raises CrawlinoFormatError: when an entry does not have exactly one key.
    :raises CrawlinoValueError: when the mapper key is unknown or the mapper
        class rejects the given properties.
    """
    self._raw_data = fields
    self.mappers = {}

    # A missing section arrives as None; iterate over nothing in that case
    # instead of failing with a raw TypeError.
    for m in fields or ():
        # Get the key
        ks = list(m.keys())

        if len(ks) != 1:
            raise CrawlinoFormatError(
                "Invalid mapper format. Each map, "
                "only can have one dictionary value",
                exc_info=True,
                extra={"map_value": str(m)})

        key_action = ks[0]

        # Determine which sub-class to build
        try:
            map_obj = self.MAPPERS[key_action](**m[key_action])
        except KeyError:
            raise CrawlinoValueError("Invalid mapper",
                                     exc_info=True,
                                     extra={"mapper_name": key_action})
        except TypeError as e:
            # Keep whatever follows the last "argument" in the interpreter
            # message (usually the offending keyword). Fall back to the full
            # message when the word is absent or the exception has no text.
            message = str(e)
            marker = "argument"
            cut = message.rfind(marker)
            invalid_arg = message[cut + len(marker):] if cut != -1 \
                else message

            raise CrawlinoValueError(
                "Invalid mapper. Mapper destination "
                "doesn't required property",
                exc_info=True,
                extra={
                    "invalid_property": invalid_arg,
                    "mapper_name": key_action
                }) from e

        # Store the object
        self.mappers[map_obj.name] = map_obj
def generator_numeric(*args, **kwargs):
    """Yield every integer from ``args[0]`` up to, but not including,
    ``args[1]``.

    Positional arguments after the first two, and all keyword arguments, are
    ignored. Being a generator, nothing here (the range check included) runs
    until the first item is requested.

    Raises ``CrawlinoValueError`` when the first bound is greater than the
    second.
    """
    log.debug("Numeric generator plugin")

    lower, upper, *_unused = args

    if lower > upper:
        raise CrawlinoValueError(
            f"Start range in higher than lower, no data could be generated - "
            f"start: {lower} - end: {upper}")

    for number in range(lower, upper):
        yield number
def __init__(self, config: dict):
    """Build a rule set from its configuration dict.

    Mandatory keys ``name``, ``mapTo`` and ``inputVar`` are copied to
    attributes named by ``un_camel(key)``. ``rules`` must be present; each
    rule becomes a ``CMRule`` stored in ``self.rules`` under its position,
    which is the string form of the rule's ``config.order`` or of its index
    in the list when no order is given. Optional keys ``description``,
    ``exitOnMatch`` and ``report`` default to ``""``, ``True`` and
    ``"group"``.

    :raises CrawlinoValueError: when a mandatory key or ``rules`` is missing,
        or when two rules resolve to the same position.
    """
    #
    # Mandatory args
    #
    for x in ("name", "mapTo", "inputVar"):
        # Only the lookup is guarded, so a KeyError raised elsewhere is not
        # misreported as a missing keyword.
        try:
            value = config[x]
        except KeyError:
            raise CrawlinoValueError(
                f"Keyword '{x}' is necessary in the ruleSet definition")
        setattr(self, un_camel(x), value)

    try:
        raw_rules = config["rules"]
    except KeyError:
        raise CrawlinoValueError(
            "you must define at least one rule in a 'ruleSet' entry")

    self.rules = OrderedDict()

    for i, rule in enumerate(raw_rules):
        # `config` may be present but empty (None); treat it like a missing
        # key instead of failing with AttributeError on None.get().
        rule_config = rule.get("config") or {}
        position = str(rule_config.get("order", i))

        if position in self.rules:
            raise CrawlinoValueError(
                f"conflict in order parameter for rules in ruleSet "
                f"'{self.name}': already is an element with "
                f"position '{position}'")

        self.rules[position] = CMRule(type=rule.get("type"),
                                      config=rule.get("config"),
                                      name=rule.get("name"))

    #
    # Optional
    #
    self.description = config.get("description", "")
    self.exit_on_match = config.get("exitOnMatch", True)
    self.report = config.get("report", "group")
def generator_random(*args, **kwargs):
    """Yield random strings built from digits, optionally mixed with letters.

    Exactly three positional arguments are expected:

    - ``args[0]``: kind of value; ``"string"`` adds ASCII letters to the
      digits, any other value uses digits only
    - ``args[1]``: length of every generated value
    - ``args[2]``: how many values to generate

    Keyword arguments are ignored. Raises ``CrawlinoValueError`` when the
    requested amount is zero or negative.
    """
    kind, length, count = args

    alphabet = string.digits
    if kind == "string":
        alphabet = alphabet + string.ascii_letters

    if count <= 0:
        raise CrawlinoValueError(
            "Total generated values must be bigger than 0")

    for _ in range(count):
        picked = [random.choice(alphabet) for _ in range(length)]
        yield "".join(picked)
def input_raw_socket(prev_step: PluginReturnedData, **kwargs) \
        -> PluginReturnedData:
    """Connect to the previous step's target and return what it answers.

    The ``config`` keyword argument is a dict with these keys:

    - ``port``: port to connect to (mandatory, must convert to ``int``)
    - ``data``: text to send; ``"\\r\\n\\r\\n"`` is sent when empty
    - ``timeout``: socket timeout in seconds, default ``"0.5"``
    - ``proto``: ``"tcp"`` (default) or ``"udp"``

    :param prev_step: result of the previous step; its ``source_type`` must
        be one of ip, web, domain or url, and ``target`` holds the address.
    :return: ``PluginReturnedData`` with ``host``, ``status`` (``"open"`` or
        ``"closed/filtered"``), ``data`` (decoded answer or ``None``) and
        ``port``.
    :raises CrawlinoValueError: on unsupported source type or protocol, or
        on an invalid timeout or port.
    """
    log.debug("Starting plugin - input::raw-socket")

    allowed_inputs = ("ip", "web", "domain", "url")
    allowed_proto = ("tcp", "udp")

    # Load data
    prev_config = dict_to_object(prev_step.to_dict)

    if prev_config.source_type not in allowed_inputs:
        raise CrawlinoValueError(f"This plugin only works with: "
                                 f"{'|'.join(allowed_inputs)}")

    # -------------------------------------------------------------------------
    # Extract config
    # -------------------------------------------------------------------------
    config = kwargs.get("config", {})
    port_to_test = config.get("port", None)
    data_to_send = config.get("data", None)
    connection_timeout = config.get("timeout", "0.5")
    port_proto = "tcp"

    #
    # Check proto
    #
    if config.get("proto", None):
        if config.get("proto", None) not in allowed_proto:
            raise CrawlinoValueError(f"This plugin only works with: "
                                     f"{'|'.join(allowed_proto)}")

        port_proto = config.get("proto")

    if port_proto == "tcp":
        proto = socket.SOCK_STREAM
    else:
        proto = socket.SOCK_DGRAM

    #
    # Checking timeout
    #
    try:
        timeout = float(connection_timeout)
    except (TypeError, ValueError):
        # TypeError covers an explicit `timeout: null` in the config.
        raise CrawlinoValueError(
            "Invalid timeout value. It must be a float falue")

    #
    # Checking port
    #
    try:
        port_number = int(port_to_test)
    except (TypeError, ValueError):
        # Report a missing/invalid port as a configuration error instead of
        # letting int() fail later, inside the socket block.
        raise CrawlinoValueError(
            f"Invalid port value: '{port_to_test}'. It must be an integer")

    #
    # Extract target
    #
    if prev_config.source_type == "ip":
        ip = prev_config.target
    else:
        # NOTE(review): urlparse() yields an empty netloc for targets without
        # a scheme (e.g. a bare domain) - confirm the format of `target`.
        ip, *_ = urllib.parse.urlparse(prev_config.target).netloc.split(":")

    #
    # Do connection
    #
    if not data_to_send:
        data_to_send = b"\r\n\r\n"
    else:
        data_to_send = data_to_send.encode()

    log.debug(f"Connecting to {ip}:{port_to_test}...")

    with socket.socket(socket.AF_INET, proto) as s:
        s.settimeout(timeout)
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)

        code = s.connect_ex((ip, port_number))

        if code == 0:  # 0 = Open
            # The connection succeeded, so the port is open even if no data
            # comes back. Setting both values up front keeps them defined
            # when the exchange times out (previously an UnboundLocalError).
            status = "open"
            received_data = None

            try:
                s.sendall(data_to_send)
                payload, *_ = s.recvmsg(100000)
                received_data = payload.decode(errors="ignore")
            except socket.timeout:
                log.error(f"Port {port_to_test} is open but it got a "
                          f"timeout when try to get data from socket")
        else:
            received_data = None
            status = "closed/filtered"

    return PluginReturnedData(host=ip,
                              status=status,
                              data=received_data,
                              port=port_to_test)
def __init__(self,
             paths: str or List[str],
             default_crawler_extension: str = "yaml",
             concurrency: int = 1,
             concurrency_type: str = "threads",
             environment_vars: List[str] = None,
             environment_file: str = None,
             crawlers_templates_path: List[str] or None = None):
    """Normalise the running options and export the environment variables.

    :param paths: one path/glob or a list of them; every glob is expanded
        and the matches are stored as absolute paths in ``self.paths``.
    :param default_crawler_extension: stored as-is, ``"yaml"`` when falsy.
    :param concurrency: converted with ``int()``; values that cannot be
        converted, or that are below 1, become 1.
    :param concurrency_type: must be listed in ``self.CONCURRENCY_MODES``.
    :param environment_vars: ``VAR=VALUE`` strings, applied after those read
        from ``environment_file``.
    :param environment_file: text file with one ``VAR=VALUE`` per line.
    :param crawlers_templates_path: extra template directories, appended to
        the ``crawlers_templates`` directory located one level above this
        file's directory.
    :raises CrawlinoValueError: on an unknown ``concurrency_type``.
    :raises CrawlinoFormatError: when an environment entry has no ``=``.

    Side effect: every accepted entry is written to ``os.environ``.
    """
    self.default_crawler_extension: str = \
        default_crawler_extension or "yaml"

    # ---------------------------------------------------------------------
    # Paths
    # ---------------------------------------------------------------------
    if isinstance(paths, list):
        tmp_paths = paths
    else:
        tmp_paths = [paths if paths else ""]

    # Expand any Glob in paths: *.py -> 1.py, 2.py...
    self.paths = [op.abspath(e) for x in tmp_paths for e in glob.glob(x)]

    try:
        con = int(concurrency)
    except (TypeError, ValueError):
        # TypeError covers concurrency=None.
        con = 1

    self.concurrency: int = max(con, 1)

    if concurrency_type not in self.CONCURRENCY_MODES:
        raise CrawlinoValueError(f"Invalid concurrency type. Allowed types"
                                 f" are: "
                                 f"{'|'.join(self.CONCURRENCY_MODES)}")

    self.concurrency_type = concurrency_type
    self.crawlers_templates_path = [
        op.abspath(op.join(op.dirname(__file__),
                           "..",
                           "crawlers_templates"))
    ]
    if crawlers_templates_path:
        if not isinstance(crawlers_templates_path, list):
            crawlers_templates_path = [crawlers_templates_path]

        self.crawlers_templates_path.extend(crawlers_templates_path)

    # ---------------------------------------------------------------------
    # Set environment vars
    # ---------------------------------------------------------------------
    self.environment_vars = []
    self.environment_file = environment_file

    if self.environment_file:
        self.environment_file = op.abspath(environment_file)

        with open(self.environment_file, "r") as f:
            # Blank lines (e.g. a trailing newline or visual separators) are
            # not variables; skip them instead of rejecting the whole file.
            self.environment_vars.extend(
                line for line in f.read().splitlines() if line.strip())

    if environment_vars:
        self.environment_vars.extend(environment_vars)

    # Remove duplicates while keeping first-seen order, so that when the same
    # variable appears twice the later entry wins deterministically (a plain
    # set() gave an arbitrary order).
    self.environment_vars = list(dict.fromkeys(self.environment_vars))

    for v in self.environment_vars:
        # Split on the first '=' only: the value itself may contain '='.
        var_name, separator, var_value = v.partition("=")

        if not separator:
            raise CrawlinoFormatError(
                f"Environment vars must be set as format: VAR=VALUE. "
                f"Got: '{v}'")

        log.debug(f"Setting environment var '{var_name}' with value "
                  f"'{var_value}'")

        os.environ[var_name] = var_value

    log.info(f"Working mode '{self.concurrency_type}' with "
             f"concurrency '{self.concurrency}'")
    log.info(f"Selected {len(self.crawlers_templates_path)} "
             f"crawlers paths")
    log.info(f"Default crawler extension selected: "
             f"'{self.default_crawler_extension}")