def handle_results(self, results, session_key, in_preview):
    # FYI: we ignore results since this is a generating command

    # Make sure that the url field was provided
    if self.url is None:
        self.logger.warning("No url was provided")
        return

    # Parse the URL
    url_field = URLField('name', 'title', 'description')
    url_parsed = url_field.to_python(self.url)

    # Do the web-ping
    result = WebPing.ping(url_parsed, logger=self.logger,
                          should_contain_string=self.expected_string,
                          return_headers=self.return_headers)

    # Prep the result dictionary
    data = {
        'response_code': result.response_code if result.response_code > 0 else '',
        'total_time': round(result.request_time, 2) if result.request_time > 0 else '',
        'timed_out': result.timed_out,
        'url': result.url
    }

    # Add the MD5 of the response if available
    if result.response_md5 is not None:
        data['content_md5'] = result.response_md5

    # Add the SHA-224 of the response if available
    if result.response_sha224 is not None:
        data['content_sha224'] = result.response_sha224

    # Add the size of the response if available
    if result.response_size is not None:
        data['content_size'] = result.response_size

    # Add the variable noting if the expected string was present
    if result.has_expected_string is not None:
        data['has_expected_string'] = str(result.has_expected_string).lower()

    # Add the headers if present
    if result.headers is not None:
        for header in result.headers:
            data['header_' + header] = result.headers[header]

    # Output the results
    self.output_results([data])
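# Illustrative only: the dictionary built above produces a single result row shaped
# roughly like the sketch below. Field names come from the code; the values are
# made up for illustration.
#
# {
#     'response_code': 200,
#     'total_time': 0.42,
#     'timed_out': False,
#     'url': 'https://example.com/',
#     'content_md5': '9b2bfe80...',
#     'content_sha224': '0ab42d31...',
#     'content_size': 5120,
#     'has_expected_string': 'true',
#     'header_content-type': 'text/html; charset=UTF-8'
# }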
def __init__(self, timeout=30): scheme_args = {'title': "Website Availability Check", 'description': "Connects to a website in order to obtain performance statistics", 'use_external_validation': "true", 'streaming_mode': "xml", 'use_single_instance': "true"} args = [ Field("title", "Title", "A short description (typically just the domain name)", empty_allowed=False), URLField("url", "URL", "The URL to connect to (must be be either HTTP or HTTPS protocol)", empty_allowed=False), DurationField("interval", "Interval", "The interval defining how often to perform the check; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)", empty_allowed=False), Field("configuration", "Configuration", "Defines a specific proxy configuration to use (in website_monitoring.spec) if not using the default; only used if you want to have multiple proxy servers", none_allowed=True, empty_allowed=True), Field("client_certificate", "Client Certificate Path", "Defines the path to the client certificate (if the website requires client SSL authentication)", none_allowed=True, empty_allowed=True), Field("client_certificate_key", "Client Certificate Key Path", "Defines the path to the client certificate key (necessary of the key is in a separate file from the certificate)", none_allowed=True, empty_allowed=True), Field("username", "Username", "The username to use for authenticating (only HTTP authentication supported)", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False), Field("password", "Password", "The password to use for authenticating (only HTTP authentication supported)", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False) ] ModularInput.__init__( self, scheme_args, args, logger_name='web_availability_modular_input' ) if timeout > 0: self.timeout = timeout else: self.timeout = 30
def handle_results(self, results, session_key, in_preview):
    # FYI: we ignore results since this is a generating command

    # Make sure that the url field was provided
    if self.url is None:
        self.logger.warning("No url was provided")
        return

    # Parse the URL
    url_field = URLField('name', 'title', 'description')
    url_parsed = url_field.to_python(self.url)

    # Do the web-ping
    result = WebPing.ping(url_parsed, logger=self.logger,
                          should_contain_string=self.expected_string)

    # Prep the result dictionary
    data = {
        'response_code': result.response_code if result.response_code > 0 else '',
        'total_time': round(result.request_time, 2) if result.request_time > 0 else '',
        'timed_out': result.timed_out,
        'url': result.url
    }

    # Add the MD5 of the response if available
    if result.response_md5 is not None:
        data['content_md5'] = result.response_md5

    # Add the SHA-224 of the response if available
    if result.response_sha224 is not None:
        data['content_sha224'] = result.response_sha224

    # Add the size of the response if available
    if result.response_size is not None:
        data['content_size'] = result.response_size

    # Add the variable noting if the expected string was present
    if result.has_expected_string is not None:
        data['has_expected_string'] = str(result.has_expected_string).lower()

    # Output the results
    self.output_results([data])
def __init__(self): scheme_args = {'title': "Syndication Feed (RSS, ATOM, RDF)", 'description': "Import syndication feeds (RSS, ATOM, RDF)", 'use_external_validation': "true", 'streaming_mode': "xml", 'use_single_instance': "true"} args = [ URLField("url", "Feed URL", "The URL of the feed to input", empty_allowed=False), BooleanField("include_only_changed", "Include only new or changed entries", "Only include entries that has not been indexed yet (won't get items that were already observed)", empty_allowed=False), Field("username", "Username", "The username to use for authenticating (only HTTP authentication supported)", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False), Field("password", "Password", "The password to use for authenticating (only HTTP authentication supported)", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False), DurationField("interval", "Interval", "The interval defining how often to import the feed; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)", empty_allowed=False), BooleanField("clean_html", "Convert HTML to Text", "Convert HTML to human readable text", empty_allowed=False) ] ModularInput.__init__( self, scheme_args, args, logger_name='syndication_modular_input' )
def __init__(self, timeout=30, **kwargs):

    scheme_args = {
        'title': "Web-pages",
        'description': "Retrieve information from web-pages",
        'use_external_validation': "true",
        'streaming_mode': "xml",
        'use_single_instance': "true"
    }

    args = [
        Field("title", "Title",
              "A short description (typically just the domain name)",
              empty_allowed=False),
        URLField("url", "URL",
                 "The URL to connect to (must be either HTTP or HTTPS protocol)",
                 empty_allowed=False, require_https_on_cloud=True),
        DurationField("interval", "Interval",
                      "The interval defining how often to perform the check; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)",
                      empty_allowed=False),
        IntegerField("timeout", "Timeout", "The timeout (in number of seconds)",
                     none_allowed=True, empty_allowed=True),
        SelectorField("selector", "Selector",
                      "A selector that will match the data you want to retrieve",
                      none_allowed=True, empty_allowed=True),

        # HTTP client options
        Field("user_agent", "User Agent",
              "The user-agent to use when communicating with the server",
              none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
        Field("browser", "Browser", "The browser to use",
              none_allowed=True, empty_allowed=True),

        # Output options
        ListField("name_attributes", "Field Name Attributes",
                  "A list of attributes to use for assigning a field name",
                  none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
        BooleanField("use_element_name", "Use Element Name as Field Name",
                     "Use the element's tag name as the field name",
                     none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
        BooleanField("output_as_mv", "Output as Multi-value Field",
                     "Output the matches as a multi-value field",
                     none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
        StaticListField("output_results", "Indicates when results output should be created",
                        "Output the matches only when the results changed",
                        none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False,
                        valid_values=WebInput.OUTPUT_RESULTS_OPTIONS),
        BooleanField("raw_content", "Raw content",
                     "Return the raw content returned by the server",
                     none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
        BooleanField("empty_matches", "Empty matches",
                     "Include empty rows (otherwise, they are excluded)",
                     none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
        Field("text_separator", "Text Separator",
              'A string that will be placed between the extracted values (e.g. a separator of ":" for a match against "<a>tree</a><a>frog</a>" would return "tree:frog")',
              none_allowed=True, empty_allowed=True),

        # Spidering options
        IntegerField("page_limit", "Discovered page limit",
                     "A limit on the number of pages that will be auto-discovered",
                     none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
        IntegerField("depth_limit", "Depth limit",
                     "A limit on how many levels deep the search for pages will go",
                     none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
        Field("url_filter", "URL Filter",
              "A wild-card that indicates which pages should be searched for matches",
              none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),

        # Authentication options
        Field("username", "Username", "The username to use for authenticating",
              none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
        Field("password", "Password", "The password to use for authenticating",
              none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
        Field("username_field", "Username field",
              "The name of the username field on the login form",
              none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
        Field("password_field", "Password field",
              "The name of the password field on the login form",
              none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
        URLField("authentication_url", "Authentication URL", "The URL of the login form",
                 none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False,
                 require_https_on_cloud=True)
    ]

    ModularInput.__init__(self, scheme_args, args,
                          logger_name='web_input_modular_input',
                          logger_level=logging.INFO)

    if timeout > 0:
        self.timeout = timeout
    else:
        self.timeout = 30
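# Illustrative inputs.conf stanza for the web input. The stanza prefix is assumed
# to be "web_input" (the actual scheme name is registered elsewhere), and only a
# subset of the arguments defined above is shown.
#
# [web_input://example_scrape]
# title = example.com headlines
# url = https://example.com/
# selector = h1.headline
# interval = 1h
# timeout = 30
# output_as_mv = 1
# page_limit = 5
# depth_limit = 2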
def scrape_page(self, url, selector, name_attributes=[], output_matches_as_mv=True,
                output_matches_as_separate_fields=False, include_empty_matches=False,
                use_element_name=False, page_limit=1, depth_limit=50, url_filter=None,
                include_raw_content=False, text_separator=None, browser=None,
                additional_fields=None, match_prefix=None, empty_value='NULL',
                https_only=False, output_fx=None):
    """
    Retrieve data from a website.

    Arguments:
    url -- The URL to connect to. This ought to be an instance derived from using urlparse
    selector -- A CSS selector that matches the data to retrieve
    name_attributes -- Attributes whose values are used for assigning the field names
    output_matches_as_mv -- Output all of the matches with the same name ("match")
    output_matches_as_separate_fields -- Output all of the matches as separate fields ("match1", "match2", etc.)
    include_empty_matches -- Output matches that result in empty strings
    use_element_name -- Use the element's tag name as the field name
    page_limit -- The limit on the number of pages to match
    depth_limit -- The limit on the depth of URLs found
    url_filter -- A wild-card to limit the extracted URLs to
    include_raw_content -- Include the raw content (if true, the 'content' field will include the raw content)
    text_separator -- The content to put between each text node that matches within a given selector
    browser -- The browser to use
    additional_fields -- Additional fields to put into the result set
    match_prefix -- A prefix to prepend to the match fields
    empty_value -- The value to use for empty matches
    https_only -- Only extract links that use HTTPS
    output_fx -- Run this function against the results for outputting them
    """

    if isinstance(url, string_types):
        url = URLField.parse_url(url, "url")

    if isinstance(selector, string_types):
        selector = SelectorField.parse_selector(selector, "selector")

    if self.logger is not None:
        self.logger.info('Running web input, url="%s"', url.geturl())

    results = []
    results_count = 0
    client = None

    try:
        # Make the browser client if necessary
        if browser == WebScraper.FIREFOX:
            client = FirefoxClient(timeout=self.timeout, user_agent=self.user_agent, logger=self.logger)
        elif browser == WebScraper.CHROME:
            client = ChromeClient(timeout=self.timeout, user_agent=self.user_agent, logger=self.logger)
        else:
            client = DefaultWebClient(self.timeout, user_agent=self.user_agent, logger=self.logger)

        # Setup the proxy
        client.setProxy(self.proxy_type, self.proxy_server, self.proxy_port,
                        self.proxy_user, self.proxy_password)

        # Setup credentials
        client.setCredentials(self.username, self.password)

        # Do form authentication
        if self.username is not None and self.password is not None and self.authentication_url is not None:
            client.doFormLogin(self.authentication_url.geturl(), self.username_field, self.password_field)

        # Run the scraper and get the results
        extracted_links = OrderedDict()
        extracted_links[url.geturl()] = DiscoveredURL(0)

        # Process each result
        while results_count < page_limit:
            source_url_depth = 0
            url = None

            for k, v in extracted_links.items():
                if v.processed == False:
                    url = k
                    source_url_depth = v.depth

                    # Track that the URL was checked since we are going to process it
                    extracted_links[k].processed = True

                    # Since we found one, stop looking for one to process
                    break

            # Stop if we have no more URLs to process
            if url is None:
                if self.logger is not None:
                    self.logger.debug("No more URLs in the list to process")
                break

            # Make the keyword argument list
            kw = {
                'url_filter': url_filter,
                'source_url_depth': source_url_depth,
                'include_raw_content': include_raw_content,
                'text_separator': text_separator,
                'browser': browser,
                'extracted_links': extracted_links,
                'match_prefix': match_prefix,
                'empty_value': empty_value
            }

            # Don't have the function extract URLs if the depth limit has been reached
            if source_url_depth >= depth_limit:
                kw['extracted_links'] = None

            # Perform the scrape
            result = self.get_result_single(client, urlparse(url), selector, name_attributes,
                                            output_matches_as_mv, output_matches_as_separate_fields,
                                            include_empty_matches, use_element_name,
                                            additional_fields=additional_fields, **kw)

            # Append the result
            if result is not None:
                if output_fx is None:
                    results.append(result)
                    results_count = len(results)
                else:
                    output_fx(result)
                    results_count = results_count + 1

    except (LoginFormNotFound, FormAuthenticationFailed, WebClientException) as e:
        raise e

    except Exception:
        # TODO: remove this one or the one in get_result_single()
        if self.logger is not None:
            self.logger.exception("A general exception was thrown when executing a web request")
        raise

    finally:
        if client:
            client.close()

    # Return the results if we didn't use the output function
    if output_fx is None:
        return results
    # Otherwise, return the results count
    else:
        return results_count
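# Hypothetical usage sketch (not from the source): how scrape_page might be
# invoked directly. The import path, class constructor arguments, and the
# 'match' field name in the result rows are assumptions.
#
# from web_input import WebScraper
#
# scraper = WebScraper(timeout=30)
# results = scraper.scrape_page(
#     url='https://example.com/',
#     selector='div.price',
#     output_matches_as_mv=True,
#     page_limit=5,
#     depth_limit=2,
#     url_filter='https://example.com/*')
#
# for result in results:
#     print(result.get('match'))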