def _get_web_page_from_db(self, current_session, page_id=None, url=None, page=None):
    """Rebuild a ``WebPage`` object from its stored page document.

    The page document is either handed in directly via ``page`` or looked
    up in ``self.pages`` for ``current_session`` by ``page_id`` (preferred)
    or by ``url``.

    :param current_session: session identifier used in every lookup.
    :param page_id: value matched against the ``web_page_id`` field.
    :param url: value matched against the ``url`` field; only consulted
        when ``page_id`` is None.
    :param page: an already fetched page document; skips the lookup.
    :return: the populated ``WebPage``, or None when no document matched.
    :raises AttributeError: if ``page``, ``page_id`` and ``url`` are all None.
    """
    if page is None:
        if page_id is not None:
            query = {"session": current_session, "web_page_id": page_id}
        elif url is not None:
            query = {"session": current_session, "url": url}
        else:
            raise AttributeError("You must specifies either page_id or url")
        page = self.pages.find_one(query)
        if page is None:
            return None

    stored_id = page['web_page_id']
    clickables = self.get_all_clickables_to_page_id_from_db(current_session, stored_id)
    forms = self.get_all_forms_to_page_id_from_db(current_session, stored_id)

    result = WebPage(stored_id, page['url'], page['html'], None,
                     page['current_depth'], page['base_url'])
    result.clickables = clickables
    result.forms = forms
    result.links = [self._parse_link_from_db(stored_link)
                    for stored_link in page['links']]
    # The document only stores request ids; resolve each one to its object.
    result.timing_requests = [
        self.get_asyncrequest_to_id(current_session, request_id)
        for request_id in page['timing_requests']
    ]
    result.ajax_requests = [
        self.get_asyncrequest_to_id(current_session, request_id)
        for request_id in page['ajax_requests']
    ]
    return result
def _get_web_page_from_db(self, current_session, page_id=None, url=None, page=None):
    """Load a stored page document and turn it into a ``WebPage``.

    When ``page`` is not supplied, the document is fetched from
    ``self.pages`` for ``current_session``, keyed by ``page_id`` if given,
    otherwise by ``url``.

    :param current_session: session the page belongs to.
    :param page_id: ``web_page_id`` to look for.
    :param url: ``url`` to look for when no ``page_id`` is given.
    :param page: pre-fetched page document (no lookup is done then).
    :return: a ``WebPage`` with clickables, forms, links, timing requests
        and ajax requests attached, or None if nothing was found.
    :raises AttributeError: when neither ``page``, ``page_id`` nor ``url``
        is provided.
    """
    document = page
    if document is None:
        criteria = {"session": current_session}
        if page_id is not None:
            criteria["web_page_id"] = page_id
        elif url is not None:
            criteria["url"] = url
        else:
            raise AttributeError("You must specifies either page_id or url")
        document = self.pages.find_one(criteria)
    if document is None:
        return None

    def resolve_requests(request_ids):
        # Stored pages reference their async requests by id only.
        resolved = []
        for request_id in request_ids:
            resolved.append(
                self.get_asyncrequest_to_id(current_session, request_id))
        return resolved

    page_key = document['web_page_id']
    found_clickables = self.get_all_clickables_to_page_id_from_db(
        current_session, page_key)
    found_forms = self.get_all_forms_to_page_id_from_db(
        current_session, page_key)

    web_page = WebPage(page_key, document['url'], document['html'], None,
                       document['current_depth'], document['base_url'])
    web_page.clickables = found_clickables
    web_page.forms = found_forms
    web_page.links = list(map(self._parse_link_from_db, document['links']))
    web_page.timing_requests = resolve_requests(document['timing_requests'])
    web_page.ajax_requests = resolve_requests(document['ajax_requests'])
    return web_page
def _login_and_return_webpage(self, login_form, page_with_login_form=None, login_data=None, login_clickable=None):
    """Submit the login form and return the page that results from it.

    If ``login_clickable`` is given, it is first executed on a deep copy of
    the page (retried once after a short pause when the initial load
    fails), and the resulting page is post-processed by the domain handler
    and the async request handler before the form is submitted.

    :param login_form: the form handed to ``self._form_handler.submit_form``.
    :param page_with_login_form: page containing the form; defaults to
        ``self._page_with_loginform_logged_out``.
    :param login_data: data passed through to ``submit_form``.
    :param login_clickable: optional element that has to be clicked before
        the form can be submitted.
    :return: a ``WebPage`` (id ``-1``) describing the state after the
        submit, or None if the clickable failed to execute twice or a
        ``ValueError`` was raised while preparing/submitting the form.
    """
    if page_with_login_form is None:
        page_with_login_form = self._page_with_loginform_logged_out

    try:
        if login_clickable is not None:
            tmp_page = deepcopy(page_with_login_form)
            event_state, page_with_login_form = self._event_executor.execute(
                tmp_page, element_to_click=login_clickable)
            if event_state == EventResult.ErrorWhileInitialLoading:
                # Pause briefly, then retry exactly once. The original passed
                # 2000 here, which is ~33 minutes if ``sleep`` is
                # ``time.sleep`` (seconds); a 2 s (2000 ms) pause is what was
                # meant. NOTE(review): assumes ``sleep`` is ``time.sleep``.
                sleep(2)
                event_state, page_with_login_form = self._event_executor.execute(
                    tmp_page, element_to_click=login_clickable)
                if event_state == EventResult.ErrorWhileInitialLoading:
                    logging.debug("Two time executing fails.. stop crawling")
                    return None
            self.domain_handler.complete_urls_in_page(page_with_login_form)
            self.domain_handler.analyze_urls(page_with_login_form)
            self.async_request_handler.handle_requests(page_with_login_form)

        logging.debug("Start submitting login form...")
        # The response code is returned by submit_form but not needed here.
        (response_code, html_after_timeouts, new_clickables, forms, links,
         timing_requests) = self._form_handler.submit_form(
            login_form, page_with_login_form, login_data)
    except ValueError:
        return None

    # TODO: Put building of Webpage inside submit function
    page_after_login = WebPage(-1, page_with_login_form.url, html_after_timeouts)
    page_after_login.clickables = new_clickables
    page_after_login.links = links
    page_after_login.timing_requests = timing_requests
    page_after_login.forms = forms

    self.domain_handler.complete_urls_in_page(page_after_login)
    self.domain_handler.analyze_urls(page_after_login)
    self.async_request_handler.handle_requests(page_after_login)
    return page_after_login
def analyze(self, url_to_request, timeout=10, current_depth=None, method="GET", data=None):
    """Load a URL in the main frame and collect what the page exposes.

    The page is loaded (GET, or a form-encoded POST for any other
    ``method``), pending timing events are waited for, and links,
    clickables, forms and elements with event properties are extracted.

    :param url_to_request: URL as a string, or an object offering
        ``toString()``.
    :param timeout: upper bound for the load/wait loops; compared against
        the accumulated ``self.wait_for_processing`` steps.
    :param current_depth: accepted for interface compatibility; not used.
    :param method: ``"GET"`` loads the URL directly, anything else sends
        ``data`` as a POST request.
    :param data: POST payload passed to ``self.post_data_to_array``;
        ``None`` is treated as an empty dict.
    :return: tuple ``(response_code, current_page)``; the response code
        falls back to 200 when none was recorded for the URL.
    """
    if data is None:
        data = {}
    try:
        url_to_request = url_to_request.toString()
    except AttributeError:
        pass  # no toString(): the value is used as given

    logging.debug("Start analyzing the url {}...".format(url_to_request))
    self._timing_requests = []
    self._new_clickables = []
    self._timeming_events = []
    self._current_timeming_event = None
    self._loading_complete = False
    self._analyzing_finished = False
    self.response_code = {}

    if method == "GET":
        self.mainFrame().load(QUrl(url_to_request))
    else:
        request = self.make_request(url_to_request)
        data = self.post_data_to_array(data)
        request.setRawHeader("Content-Type", QByteArray("application/x-www-form-urlencoded"))
        self.mainFrame().load(request, QNetworkAccessManager.PostOperation, data)

    t = 0
    while not self._loading_complete and t < timeout:  # Waiting for finish processing
        self._wait(self.wait_for_processing)
        t += self.wait_for_processing

    videos = self.mainFrame().findAllElements("video")
    if len(videos) > 0:
        # The placeholder was previously logged unformatted ("{} videos ...").
        logging.debug("{} videos found... removing them".format(len(videos)))
        for video in videos:
            video.removeFromDocument()

    overall_waiting_time = t
    buffer_in_milliseconds = 250
    while len(self._timeming_events) > 0 and overall_waiting_time < timeout:
        # Take the first event (ordered by needed time)
        self._current_timeming_event = self._timeming_events.pop(0)
        # Setting kind of event
        self._waiting_for = self._current_timeming_event["event_type"]
        # NOTE(review): the event time looks like milliseconds while
        # overall_waiting_time is accumulated from the converted values
        # below - the subtraction appears to mix units; behavior is kept
        # as-is, please confirm the intent.
        waiting_time = self._current_timeming_event["time"] - overall_waiting_time
        # Add the safety buffer and convert from milliseconds to seconds
        waiting_time = (waiting_time + buffer_in_milliseconds) / 1000.0
        if waiting_time < 0.0:
            waiting_time = 0
        self._wait(waiting_time)
        overall_waiting_time += waiting_time

    # Always give the page at least 0.5 in total before inspecting it
    if overall_waiting_time < 0.5:
        self._wait((0.5 - overall_waiting_time))

    base_url = self.mainFrame().findFirstElement("base")
    if base_url is not None:
        base_url = base_url.attribute("href")

    links, clickables = extract_links(self.mainFrame(), url_to_request)
    forms = extract_forms(self.mainFrame())
    elements_with_event_properties = property_helper(self.mainFrame())
    self.mainFrame().evaluateJavaScript(self._property_obs_js)
    self._wait(0.1)

    self._analyzing_finished = True
    html_after_timeouts = self.mainFrame().toHtml()
    response_url = self.mainFrame().url().toString()
    self.mainFrame().setHtml(None)

    self._new_clickables.extend(clickables)
    self._new_clickables.extend(elements_with_event_properties)
    self._new_clickables = purge_dublicates(self._new_clickables)

    # Missing or empty entry for this URL counts as a successful response
    response_code = self.response_code.get(url_to_request)
    if response_code is None:
        response_code = 200

    try:
        current_page = WebPage(self.parent().get_next_page_id(), response_url, html_after_timeouts)
    except AttributeError:
        # Attacker don't need this function...
        current_page = WebPage(42, response_url, html_after_timeouts)

    current_page.timing_requests = self._timing_requests
    current_page.clickables = self._new_clickables
    current_page.links = links
    current_page.forms = forms
    if base_url is not None and base_url != "":
        current_page.base_url = base_url
    return response_code, current_page
def analyze(self, url_to_request, timeout=10, current_depth=None, method="GET", data=None):
    """Request a URL in the main frame and analyze the rendered page.

    Steps: reset the per-run state, load the URL (GET, or POST with
    form-encoded ``data``), wait until loading finished or ``timeout`` is
    reached, strip ``<video>`` elements, wait for queued timing events,
    then extract links, clickables, forms and event-property elements and
    wrap everything in a ``WebPage``.

    :param url_to_request: URL string or an object with ``toString()``.
    :param timeout: limit for the loading loop and the timing-event loop.
    :param current_depth: unused; kept so existing callers keep working.
    :param method: ``"GET"`` or anything else for a POST request.
    :param data: POST payload for ``self.post_data_to_array``; defaults to
        an empty dict when omitted.
    :return: ``(response_code, current_page)``; ``response_code`` is 200
        when nothing (or None) was recorded for ``url_to_request``.
    """
    # A None sentinel replaces the former shared mutable default ``{}``.
    post_payload = {} if data is None else data

    try:
        url_to_request = url_to_request.toString()
    except AttributeError:
        pass  # value has no toString(); keep it unchanged

    logging.debug("Start analyzing the url {}...".format(url_to_request))
    self._timing_requests = []
    self._new_clickables = []
    self._timeming_events = []
    self._current_timeming_event = None
    self._loading_complete = False
    self._analyzing_finished = False
    self.response_code = {}

    frame = self.mainFrame
    if method == "GET":
        frame().load(QUrl(url_to_request))
    else:
        request = self.make_request(url_to_request)
        post_payload = self.post_data_to_array(post_payload)
        request.setRawHeader(
            'Content-Type', QByteArray('application/x-www-form-urlencoded'))
        frame().load(request, QNetworkAccessManager.PostOperation, post_payload)

    # Waiting for finish processing
    elapsed = 0
    while not self._loading_complete and elapsed < timeout:
        self._wait(self.wait_for_processing)
        elapsed += self.wait_for_processing

    videos = frame().findAllElements("video")
    video_count = len(videos)
    if video_count > 0:
        # Fill in the count; the message used to be logged with a raw "{}".
        logging.debug("{} videos found... removing them".format(video_count))
        for video in videos:
            video.removeFromDocument()

    overall_waiting_time = elapsed
    safety_buffer = 250
    while len(self._timeming_events) > 0 and overall_waiting_time < timeout:
        # Take the first event (ordered by needed time)
        event = self._timeming_events.pop(0)
        self._current_timeming_event = event
        # Setting kind of event
        self._waiting_for = event['event_type']
        # NOTE(review): event["time"] appears to be in milliseconds whereas
        # overall_waiting_time holds the converted values summed below;
        # the arithmetic is left untouched - confirm the units.
        wait_for = ((event["time"] - overall_waiting_time) + safety_buffer) / 1000.0
        if wait_for < 0.0:
            wait_for = 0
        self._wait(wait_for)
        overall_waiting_time += wait_for

    # Enforce a minimum total wait of 0.5 before reading the document
    if overall_waiting_time < 0.5:
        self._wait((0.5 - overall_waiting_time))

    base_url = frame().findFirstElement("base")
    if base_url is not None:
        base_url = base_url.attribute("href")

    links, clickables = extract_links(frame(), url_to_request)
    forms = extract_forms(frame())
    elements_with_event_properties = property_helper(frame())
    frame().evaluateJavaScript(self._property_obs_js)
    self._wait(0.1)

    self._analyzing_finished = True
    html_after_timeouts = frame().toHtml()
    response_url = frame().url().toString()
    frame().setHtml(None)

    self._new_clickables.extend(clickables)
    self._new_clickables.extend(elements_with_event_properties)
    self._new_clickables = purge_dublicates(self._new_clickables)

    # No recorded code (missing key or None) is treated as 200
    response_code = self.response_code.get(url_to_request)
    if response_code is None:
        response_code = 200

    try:
        current_page = WebPage(self.parent().get_next_page_id(),
                               response_url, html_after_timeouts)
    except AttributeError:
        # Attacker don't need this function...
        current_page = WebPage(42, response_url, html_after_timeouts)

    current_page.timing_requests = self._timing_requests
    current_page.clickables = self._new_clickables
    current_page.links = links
    current_page.forms = forms
    if base_url is not None and base_url != "":
        current_page.base_url = base_url
    return response_code, current_page