Example #1
    def handle_results(self, results, session_key, in_preview):

        # FYI: we ignore results since this is a generating command

        # Make sure that the url field was provided
        if self.url is None:
            self.logger.warning("No url was provided")
            return

        # Parse the URL
        url_field = URLField('name', 'title', 'description')
        url_parsed = url_field.to_python(self.url)

        # Do the web-ping
        result = WebPing.ping(url_parsed,
                              logger=self.logger,
                              should_contain_string=self.expected_string,
                              return_headers=self.return_headers)

        # Prep the result dictionary
        data = {
            'response_code':
            result.response_code if result.response_code > 0 else '',
            'total_time':
            round(result.request_time, 2) if result.request_time > 0 else '',
            'timed_out':
            result.timed_out,
            'url':
            result.url
        }

        # Add the MD5 of the response if available
        if result.response_md5 is not None:
            data['content_md5'] = result.response_md5

        # Add the SHA-224 of the response if available
        if result.response_sha224 is not None:
            data['content_sha224'] = result.response_sha224

        # Add the size of the response if available
        if result.response_size is not None:
            data['content_size'] = result.response_size

        # Add the variable noting if the expected string was present
        if result.has_expected_string is not None:
            data['has_expected_string'] = str(
                result.has_expected_string).lower()

        # Add the headers if present
        if result.headers is not None:
            for header in result.headers:
                data['header_' + header] = result.headers[header]

        # Output the results
        self.output_results([data])
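
For reference, the dictionary handed to output_results() above ends up shaped roughly like the sketch below. The field names are taken from the code above; the values are purely illustrative and not real measurements.

# Illustrative only: the shape of a single result row built by handle_results().
# Field names match the code above; all values here are made up.
example_row = {
    'response_code': 200,
    'total_time': 0.42,            # round(result.request_time, 2), in seconds
    'timed_out': False,
    'url': 'https://example.com/',
    'content_md5': 'd41d8cd98f00b204e9800998ecf8427e',
    'content_size': 5120,
    'has_expected_string': 'true',
    'header_content-type': 'text/html; charset=UTF-8',
}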
Example #2
    def __init__(self, timeout=30):

        scheme_args = {'title': "Website Availability Check",
                       'description': "Connects to a website in order to obtain performance statistics",
                       'use_external_validation': "true",
                       'streaming_mode': "xml",
                       'use_single_instance': "true"}
        
        args = [
                Field("title", "Title", "A short description (typically just the domain name)", empty_allowed=False),
                URLField("url", "URL", "The URL to connect to (must be be either HTTP or HTTPS protocol)", empty_allowed=False),
                DurationField("interval", "Interval", "The interval defining how often to perform the check; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)", empty_allowed=False),
                Field("configuration", "Configuration", "Defines a specific proxy configuration to use (in website_monitoring.spec) if not using the default; only used if you want to have multiple proxy servers", none_allowed=True, empty_allowed=True),
                Field("client_certificate", "Client Certificate Path", "Defines the path to the client certificate (if the website requires client SSL authentication)", none_allowed=True, empty_allowed=True),
                Field("client_certificate_key", "Client Certificate Key Path", "Defines the path to the client certificate key (necessary of the key is in a separate file from the certificate)", none_allowed=True, empty_allowed=True),
                Field("username", "Username", "The username to use for authenticating (only HTTP authentication supported)", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
                Field("password", "Password", "The password to use for authenticating (only HTTP authentication supported)", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False)
                ]
        
        ModularInput.__init__(self, scheme_args, args, logger_name='web_availability_modular_input')
        
        if timeout > 0:
            self.timeout = timeout
        else:
            self.timeout = 30
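
The interval argument above accepts durations such as "15m" or "8h". As a hedged illustration of that format only (this is not the actual DurationField implementation from the modular-input library), a parser for such strings could look like this:

# Sketch of parsing interval strings like "15m" or "8h" into seconds.
# Assumption: plain integers are treated as seconds; units s/m/h/d are supported.
import re

_UNITS = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400}

def parse_duration(value):
    match = re.match(r'^(\d+)\s*([smhd]?)$', value.strip())
    if match is None:
        raise ValueError("Invalid duration: %r" % value)
    number, unit = match.groups()
    return int(number) * _UNITS[unit or 's']

assert parse_duration("15m") == 900     # 15 minutes
assert parse_duration("8h") == 28800    # 8 hours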
Example #3
    def handle_results(self, results, session_key, in_preview):

        # FYI: we ignore results since this is a generating command

        # Make sure that the url field was provided
        if self.url is None:
            self.logger.warning("No url was provided")
            return

        # Parse the URL
        url_field = URLField('name', 'title', 'description')
        url_parsed = url_field.to_python(self.url)

        # Do the web-ping
        result = WebPing.ping(url_parsed, logger=self.logger, should_contain_string=self.expected_string)

        # Prep the result dictionary
        data = {
            'response_code': result.response_code if result.response_code > 0 else '',
            'total_time': round(result.request_time, 2) if result.request_time > 0 else '',
            'timed_out': result.timed_out,
            'url': result.url
        }

        # Add the MD5 of the response if available
        if result.response_md5 is not None:
            data['content_md5'] = result.response_md5

        # Add the SHA-224 of the response if available
        if result.response_sha224 is not None:
            data['content_sha224'] = result.response_sha224

        # Add the size of the response if available
        if result.response_size is not None:
            data['content_size'] = result.response_size

        # Add the variable noting if the expected string was present
        if result.has_expected_string is not None:
            data['has_expected_string'] = str(result.has_expected_string).lower()

        # Output the results
        self.output_results([data])
Example #4
    def __init__(self):

        scheme_args = {'title': "Syndication Feed (RSS, ATOM, RDF)",
                       'description': "Import syndication feeds (RSS, ATOM, RDF)",
                       'use_external_validation': "true",
                       'streaming_mode': "xml",
                       'use_single_instance': "true"}

        args = [
                URLField("url", "Feed URL", "The URL of the feed to input", empty_allowed=False),
                BooleanField("include_only_changed", "Include only new or changed entries", "Only include entries that has not been indexed yet (won't get items that were already observed)", empty_allowed=False),
                Field("username", "Username", "The username to use for authenticating (only HTTP authentication supported)", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
                Field("password", "Password", "The password to use for authenticating (only HTTP authentication supported)", none_allowed=True, empty_allowed=True, required_on_create=False, required_on_edit=False),
                DurationField("interval", "Interval", "The interval defining how often to import the feed; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)", empty_allowed=False),
                BooleanField("clean_html", "Convert HTML to Text", "Convert HTML to human readable text", empty_allowed=False)
                ]

        ModularInput.__init__(self, scheme_args, args, logger_name='syndication_modular_input')
Example #5
    def __init__(self, timeout=30, **kwargs):

        scheme_args = {
            'title': "Web-pages",
            'description': "Retrieve information from web-pages",
            'use_external_validation': "true",
            'streaming_mode': "xml",
            'use_single_instance': "true"
        }

        args = [
            Field("title",
                  "Title",
                  "A short description (typically just the domain name)",
                  empty_allowed=False),
            URLField(
                "url",
                "URL",
                "The URL to connect to (must be be either HTTP or HTTPS protocol)",
                empty_allowed=False,
                require_https_on_cloud=True),
            DurationField(
                "interval",
                "Interval",
                "The interval defining how often to perform the check; can include time units (e.g. 15m for 15 minutes, 8h for 8 hours)",
                empty_allowed=False),
            IntegerField("timeout",
                         "Timeout",
                         'The timeout (in number of seconds)',
                         none_allowed=True,
                         empty_allowed=True),
            SelectorField(
                "selector",
                "Selector",
                "A selector that will match the data you want to retrieve",
                none_allowed=True,
                empty_allowed=True),

            # HTTP client options
            Field("user_agent",
                  "User Agent",
                  "The user-agent to use when communicating with the server",
                  none_allowed=True,
                  empty_allowed=True,
                  required_on_create=False,
                  required_on_edit=False),
            Field("browser",
                  "Browser",
                  'The browser to use',
                  none_allowed=True,
                  empty_allowed=True),

            # Output options
            ListField("name_attributes",
                      "Field Name Attributes",
                      "A list of attributes to use for assigning a field name",
                      none_allowed=True,
                      empty_allowed=True,
                      required_on_create=False,
                      required_on_edit=False),
            BooleanField("use_element_name",
                         "Use Element Name as Field Name",
                         "Use the element's tag name as the field name",
                         none_allowed=True,
                         empty_allowed=True,
                         required_on_create=False,
                         required_on_edit=False),
            BooleanField("output_as_mv",
                         "Output as Multi-value Field",
                         "Output the matches as multi-value field",
                         none_allowed=True,
                         empty_allowed=True,
                         required_on_create=False,
                         required_on_edit=False),
            StaticListField("output_results",
                            "Indicates when results output should be created",
                            "Output the matches only when results changed",
                            none_allowed=True,
                            empty_allowed=True,
                            required_on_create=False,
                            required_on_edit=False,
                            valid_values=WebInput.OUTPUT_RESULTS_OPTIONS),
            BooleanField("raw_content",
                         "Raw content",
                         "Return the raw content returned by the server",
                         none_allowed=True,
                         empty_allowed=True,
                         required_on_create=False,
                         required_on_edit=False),
            BooleanField("empty_matches",
                         "Empty matches",
                         "Include empty rows (otherwise, they are excluded)",
                         none_allowed=True,
                         empty_allowed=True,
                         required_on_create=False,
                         required_on_edit=False),
            Field(
                "text_separator",
                "Text Separator",
                'A string that will be placed between the extracted values (e.g. a separator of ":" for a match against "<a>tree</a><a>frog</a>" would return "tree:frog")',
                none_allowed=True,
                empty_allowed=True),

            # Spidering options
            IntegerField(
                "page_limit",
                "Discovered page limit",
                "A limit on the number of pages that will be auto-discovered",
                none_allowed=True,
                empty_allowed=True,
                required_on_create=False,
                required_on_edit=False),
            IntegerField(
                "depth_limit",
                "Depth limit",
                "A limit on how many levels deep the search for pages will go",
                none_allowed=True,
                empty_allowed=True,
                required_on_create=False,
                required_on_edit=False),
            Field(
                "url_filter",
                "URL Filter",
                "A wild-card that will indicate which pages it should search for matches in",
                none_allowed=True,
                empty_allowed=True,
                required_on_create=False,
                required_on_edit=False),

            # Authentication options
            Field("username",
                  "Username",
                  "The username to use for authenticating",
                  none_allowed=True,
                  empty_allowed=True,
                  required_on_create=False,
                  required_on_edit=False),
            Field("password",
                  "Password",
                  "The password to use for authenticating",
                  none_allowed=True,
                  empty_allowed=True,
                  required_on_create=False,
                  required_on_edit=False),
            Field("username_field",
                  "Username field",
                  "The name of the username field on the login form",
                  none_allowed=True,
                  empty_allowed=True,
                  required_on_create=False,
                  required_on_edit=False),
            Field("password_field",
                  "Password field",
                  "The name of the password field on the login form",
                  none_allowed=True,
                  empty_allowed=True,
                  required_on_create=False,
                  required_on_edit=False),
            URLField("authentication_url",
                     "Authentication URL",
                     "The URL of the login form",
                     none_allowed=True,
                     empty_allowed=True,
                     required_on_create=False,
                     required_on_edit=False,
                     require_https_on_cloud=True)
        ]

        ModularInput.__init__(self,
                              scheme_args,
                              args,
                              logger_name='web_input_modular_input',
                              logger_level=logging.INFO)

        if timeout > 0:
            self.timeout = timeout
        else:
            self.timeout = 30
Example #6
    def scrape_page(self, url, selector, name_attributes=[], output_matches_as_mv=True,
                    output_matches_as_separate_fields=False, include_empty_matches=False,
                    use_element_name=False, page_limit=1, depth_limit=50, url_filter=None,
                    include_raw_content=False, text_separator=None, browser=None,
                    additional_fields=None, match_prefix=None, empty_value='NULL',
                    https_only=False, output_fx=None):
        """
        Retrieve data from a website.
        
        Arguments:
        url -- The URL to connect to; this ought to be a parsed URL (i.e. the result of urlparse)
        selector -- A CSS selector that matches the data to retrieve
        name_attributes -- Attributes to use the values for assigning the names
        output_matches_as_mv -- Output all of the matches with the same name ("match")
        output_matches_as_separate_fields -- Output all of the matches as separate fields ("match1", "match2", etc.)
        include_empty_matches -- Output matches that result in empty strings
        use_element_name -- Use the element's tag name as the field name
        page_limit -- The maximum number of pages to process for matches
        depth_limit -- The limit on the depth of URLs found
        url_filter -- A wild-card to limit the extracted URLs to
        include_raw_content -- Include the raw content (if true, the 'content' field will include the raw content)
        text_separator -- The content to put between each text node that matches within a given selector
        browser -- The browser to use
        additional_fields -- Additional fields to put into the result set
        match_prefix -- A prefix to prepend to the match field names
        empty_value -- The value to use for empty matches
        https_only -- Only extract links that use HTTPS
        output_fx -- Run this function against the results for outputting them
        """

        if isinstance(url, string_types):
            url = URLField.parse_url(url, "url")

        if isinstance(selector, string_types):
            selector = SelectorField.parse_selector(selector, "selector")

        if self.logger is not None:
            self.logger.info('Running web input, url="%s"', url.geturl())

        results = []
        results_count = 0

        client = None

        try:

            # Make the browser client if necessary
            if browser == WebScraper.FIREFOX:
                client = FirefoxClient(timeout=self.timeout, user_agent=self.user_agent, logger=self.logger)
            elif browser == WebScraper.CHROME:
                client = ChromeClient(timeout=self.timeout, user_agent=self.user_agent, logger=self.logger)
            else:
                client = DefaultWebClient(self.timeout, user_agent=self.user_agent, logger=self.logger)

            # Setup the proxy
            client.setProxy(self.proxy_type, self.proxy_server, self.proxy_port, self.proxy_user, self.proxy_password)

            # Setup credentials
            client.setCredentials(self.username, self.password)

            # Do form authentication
            if self.username is not None and self.password is not None and self.authentication_url is not None:
                client.doFormLogin(self.authentication_url.geturl(), self.username_field, self.password_field)

            # Run the scraper and get the results
            extracted_links = OrderedDict()
            extracted_links[url.geturl()] = DiscoveredURL(0)

            #if self.logger is not None:
            #    self.logger.info('Running web input, url="%s"', url.geturl())

            # Process each result
            while results_count < page_limit:

                source_url_depth = 0
                url = None
                
                for k, v in extracted_links.items():
                    
                    if not v.processed:
                        url = k
                        source_url_depth = v.depth
                        
                        # Track that the URL was checked since we are going to process it
                        extracted_links[k].processed = True
                        
                        # Since we found one, stop looking for one to process
                        break
                
                # Stop if we have no more URLs to process
                if url is None:
                    if self.logger is not None:
                        self.logger.debug("No more URLs in the list to process")
                    break
                
                # Make the keyword argument list
                kw = {
                        'url_filter' : url_filter,
                        'source_url_depth': source_url_depth,
                        'include_raw_content': include_raw_content,
                        'text_separator': text_separator,
                        'browser': browser,
                        'extracted_links': extracted_links,
                        'match_prefix': match_prefix,
                        'empty_value': empty_value
                      }
                
                # Don't have the function extract URLs if the depth limit has been reached
                if source_url_depth >= depth_limit:
                    kw['extracted_links'] = None

                # Perform the scrape
                result = self.get_result_single(client, urlparse(url), selector,
                                                name_attributes, output_matches_as_mv,
                                                output_matches_as_separate_fields,
                                                include_empty_matches, use_element_name,
                                                additional_fields=additional_fields, **kw)
                
                # Append the result
                if result is not None:
                    if output_fx is None:
                        results.append(result)
                        results_count = len(results)
                    else:
                        output_fx(result)
                        results_count += 1

        except (LoginFormNotFound, FormAuthenticationFailed, WebClientException) as e:
            raise e

        except Exception:
            # TODO: remove this one or the one in get_result_single()
            if self.logger is not None:
                self.logger.exception("A general exception was thrown when executing a web request")
            raise

        finally:
            if client:
                client.close()

        # Return the results if we didn't use the output function 
        if output_fx is None:
            return results

        # Otherwise, return the results count
        else:
            return results_count
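
As a rough usage sketch of scrape_page (assumptions: the method lives on a WebScraper-style class whose constructor accepts a timeout and which initializes the proxy, credential, and logger attributes referenced above; the URL, selector, and limits below are placeholders):

# Hedged usage sketch, not taken from the original project; the constructor
# signature and attribute defaults are assumptions made for illustration.
scraper = WebScraper(timeout=30)
scraper.user_agent = 'my-crawler/1.0'       # sent with every request
scraper.username = None                     # no HTTP or form-based authentication
scraper.password = None
scraper.authentication_url = None

rows = scraper.scrape_page(
    'https://example.com/',                 # parsed internally via URLField.parse_url
    'div.price',                            # parsed internally via SelectorField.parse_selector
    output_matches_as_mv=True,
    page_limit=5,                           # stop after five processed pages
    depth_limit=2,                          # follow links at most two levels deep
    url_filter='https://example.com/*')

# With no output_fx supplied, scrape_page returns the list of result dictionaries.
for row in rows:
    print(row)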