def __preprocess(self):
    """
    Description:
    ------------
    Pre-process the downloaded robots.txt file: comments are stored as raw
    information in the database, while Allow/Disallow rules are rewritten
    with absolute URLs into the pre-processed file.

    :return: None
    """
    print("[+] PRE-PROCESSING ROBOTS.TXT")
    self.__thread_semaphore.acquire(timeout=10)
    with open(self.__robots_path, "r") as robots_file:
        robots_file_contents = robots_file.readlines()
    for content in robots_file_contents:
        content = content.replace("\n", "").strip()
        try:
            if content[0] == "#":
                # If it is a comment, store it as raw information
                SivaDB.update_raw_info(connection=self.__connection,
                                       project_id=self.__project_id,
                                       info_source="robots.txt",
                                       information=content,
                                       database_semaphore=self.__database_semaphore)
            elif content[0] == "U":
                # "User-agent:" lines are written unchanged
                self.__robots_preprocessed_file.write(content + "\n")
            else:
                full_content = ""
                # Minor fix @v0.2
                if content[0] == "D":
                    # "Disallow:" rule -> rewrite with the absolute URL
                    content = content.replace("Disallow:", "").strip()
                    full_url = URL.join_urls(self.__url, content)
                    full_content = "Disallow: " + full_url
                elif content[0] == "A":
                    # "Allow:" rule -> rewrite with the absolute URL
                    content = content.replace("Allow:", "").strip()
                    full_url = URL.join_urls(self.__url, content)
                    full_content = "Allow: " + full_url
                self.__robots_preprocessed_file.write(full_content + "\n")
        except IndexError:
            # Blank line in robots.txt
            self.__robots_preprocessed_file.write("\n")
    self.__thread_semaphore.release()
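# The rule expansion above can be reproduced with only the standard library.
# A minimal sketch, assuming URL.join_urls behaves like urllib.parse.urljoin;
# expand_robots_rule() is a hypothetical helper, not part of the project.
from urllib.parse import urljoin


def expand_robots_rule(base_url, line):
    """Rewrite a relative Allow/Disallow rule with an absolute URL."""
    line = line.strip()
    for directive in ("Disallow:", "Allow:"):
        if line.startswith(directive):
            path = line[len(directive):].strip()
            return directive + " " + urljoin(base_url, path)
    return line  # comments, User-agent lines and blanks pass through unchanged


# Example: expand_robots_rule("http://example.com", "Disallow: /admin/")
# returns "Disallow: http://example.com/admin/"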
def __init__(self, project_id, url, thread_semaphore, database_semaphore, connection):
    """
    :param project_id: The id of the project
    :param url: The website for which the administrator page is to be found
    :param thread_semaphore: Semaphore used to limit the number of concurrent worker threads
    :param database_semaphore: Semaphore used to serialise access to the database
    :param connection: The database connection object
    """
    self.__project_id = project_id
    self.__url = url
    self.__thread_semaphore = thread_semaphore
    self.__database_semaphore = database_semaphore
    self.__connection = connection
    self.__threads = []
    self.__admin_pages = []
    admin_contents = File.read_to_list("admin.txt")
    for admin_page in tqdm(admin_contents, ncols=100):
        self.__thread_semaphore.acquire()
        admin_url = URL.join_urls(self.__url, admin_page)
        t = Thread(target=self.add_if_page_found, args=(admin_url,))
        t.start()
        # Keep track of the worker so it can be joined later
        self.__threads.append(t)
    print("[+] WAITING FOR THE THREADS TO COMPLETE THEIR TASKS")
    for thread in self.__threads:
        if thread.is_alive():
            thread.join()
    # Now display and add the admin pages to the database table named "admin_table"
    for admin_page in self.__admin_pages:
        print("[+] ADMIN PAGE: ", admin_page)
        self.update_admin_page(project_id=project_id,
                               url=admin_page,
                               connection=self.__connection,
                               database_semaphore=self.__database_semaphore)
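# The admin-page discovery above follows a semaphore-bounded thread pattern.
# A minimal sketch of that pattern with the standard library and requests,
# assuming a hypothetical check_page() worker; the real add_if_page_found()
# also records results in the database and releases the project's semaphore.
import threading

import requests

MAX_WORKERS = threading.Semaphore(10)   # assumed limit; the project sets its own
found_pages = []


def check_page(url):
    try:
        # A HEAD request is enough to learn whether the page exists
        if requests.head(url, timeout=5).status_code == 200:
            found_pages.append(url)
    finally:
        MAX_WORKERS.release()           # always free the worker slot


def scan_admin_pages(base_url, wordlist):
    threads = []
    for word in wordlist:
        MAX_WORKERS.acquire()           # block when all worker slots are taken
        t = threading.Thread(target=check_page, args=(base_url.rstrip("/") + "/" + word,))
        t.start()
        threads.append(t)               # remember the thread so it can be joined
    for t in threads:
        t.join()
    return found_pages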
def brutforce(self):
    """
    Description:
    ------------
    Visit every keyword-based URL with the headless browser and print the
    resulting URL along with the HAR network log.

    :return: None
    """
    self.__browser = webdriver.PhantomJS(Static.phantomjs)
    for partial_url in self.__keywords:
        new_url = URL.join_urls(self.__website, partial_url)
        self.__browser.get(new_url)
        print(self.__browser.current_url)
        print(self.__browser.get_log("har"))
    # Quit only after all keywords have been visited
    self.__browser.quit()
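# PhantomJS support was removed from recent Selenium releases, so the same
# visits can be made with headless Chrome instead. A hedged sketch, assuming
# Selenium 4 with a chromedriver available on PATH; note that Chrome does not
# expose a "har" log the way GhostDriver did, so only the final URL is printed.
from selenium import webdriver


def visit_headless(urls):
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")      # run without a visible window
    browser = webdriver.Chrome(options=options)
    try:
        for url in urls:
            browser.get(url)
            print(browser.current_url)          # URL after any redirects
    finally:
        browser.quit()                          # always free the browser process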
def __check_programming_language(self, url):
    """
    Description:
    ============
    This method tries its best to detect the programming language used to
    build the website, based on the file extensions found in the page's links.

    Notes:
    ======
    This method heavily uses the URL class from the url package.

    :return: None
    """
    self.__thread_semaphore.acquire()
    print("[+] ANALYSING PROGRAMMING LANGUAGE")
    # These are the popular programming languages used for designing websites
    language_names = {
        ".php": "PHP",
        ".jsp": "JSP",
        ".asp": "ASP",
        ".aspx": "ASPX",
        ".py": "PYTHON",
        ".pl": "PERL"
    }
    user_agent = UserAgent.get_user_agent()
    r = URL().get_request(url=url, user_agent=user_agent)
    if r is not None:
        soup = BeautifulSoup(r.content, "html.parser")
        for anchor in soup.find_all("a"):
            try:
                partial_url = anchor.get("href")
                if "http" not in partial_url:
                    new_url = URL.join_urls(url, partial_url)
                else:
                    new_url = partial_url if URL.is_same_domain(url, partial_url) else ""
                file_name = URL.get_file_name(new_url)
                language_found = False
                for extension in language_names:
                    if extension in file_name:
                        self.__programming_language_used = language_names[extension]
                        # Now update the detected programming language in the database
                        InfoGatheringPhaseOneDatabase.update_programming_language(
                            self.__database_semaphore,
                            self.__connection,
                            self.__project_id,
                            self.__programming_language_used)
                        language_found = True
                        break
                if language_found:
                    # Stop scanning further links once the language is known
                    break
            except Exception:
                pass
    self.__thread_semaphore.release()
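# Extension-based language detection can also be done with the standard
# library alone. A minimal sketch, assuming the same extension-to-language map
# as above; detect_language() is a hypothetical helper, not part of the project.
from os.path import splitext
from urllib.parse import urlparse

EXTENSION_TO_LANGUAGE = {
    ".php": "PHP", ".jsp": "JSP", ".asp": "ASP",
    ".aspx": "ASPX", ".py": "PYTHON", ".pl": "PERL",
}


def detect_language(url):
    """Guess the server-side language from the file extension in the URL path."""
    path = urlparse(url).path                   # e.g. "/login.php"
    extension = splitext(path)[1].lower()       # e.g. ".php"
    return EXTENSION_TO_LANGUAGE.get(extension)


# Example: detect_language("http://example.com/index.php?id=1") returns "PHP"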
def crawl(self, url):
    """
    Description:
    ------------
    This will crawl the url completely.

    :param url: The url to be crawled
    :return: None
    """
    start_time = time.time()
    r = URL().get_request(url=url, user_agent=UserAgent.get_user_agent())
    end_time = time.time()
    total_time = end_time - start_time
    self.__bob_object.predict(total_time)
    if r is not None:
        # At this stage we have got the BeautifulSoup object
        soup = BeautifulSoup(r.content, "html.parser")
        # First find all the href links
        for anchor in soup.find_all("a"):
            try:
                partial_url = anchor.get("href")
                url_to_be_scanned = None  # we will scan this url
                # Check whether the href is an absolute or a partial url
                if "http" in partial_url:
                    if URL.is_same_domain(self.__base_url, partial_url):
                        if partial_url not in self.__crawled_urls:
                            self.__urls.put(partial_url)
                            self.__crawled_urls.append(partial_url)
                            url_to_be_scanned = partial_url
                else:
                    full_url = URL.join_urls(self.__base_url, partial_url)
                    if full_url not in self.__crawled_urls:
                        self.__urls.put(full_url)
                        self.__crawled_urls.append(full_url)
                        url_to_be_scanned = full_url
                # Run a simple scan on the url
                if url_to_be_scanned is not None:
                    print("[i] CURRENTLY SCANNING [GET]: ", url_to_be_scanned)
                    SimpleScan(project_id=self.__project_id,
                               thread_semaphore=self.__thread_semaphore,
                               database_semaphore=self.__database_semaphore,
                               url=url_to_be_scanned,
                               connection=self.__connection,
                               poc_object=self.__poc_object)
            except Exception as e:
                print("[-] EXCEPTION OCCURRED ", e)
    while not self.__urls.empty():
        self.crawl(self.__urls.get())
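# The crawler above is a queue-driven, same-domain crawl. A minimal
# self-contained sketch of that idea with requests and BeautifulSoup,
# assuming plain GET requests; crawl_site() is a hypothetical helper.
from queue import Queue
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


def crawl_site(base_url, max_pages=50):
    seen = {base_url}
    pending = Queue()
    pending.put(base_url)
    while not pending.empty() and len(seen) < max_pages:
        url = pending.get()
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            continue                                     # skip unreachable pages
        soup = BeautifulSoup(response.content, "html.parser")
        for anchor in soup.find_all("a", href=True):
            absolute = urljoin(url, anchor["href"])      # resolve relative links
            same_domain = urlparse(absolute).netloc == urlparse(base_url).netloc
            if same_domain and absolute not in seen:
                seen.add(absolute)
                pending.put(absolute)
    return seen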
def __get_robots(self):
    """
    Description:
    ------------
    This method is used to get the robots.txt file from the remote server.

    :return: None
    """
    self.__thread_semaphore.acquire()
    robots_url = URL.join_urls(self.__url, "/robots.txt")
    print("[+] GETTING ROBOTS.TXT AT ", robots_url)
    # Send the HEAD request to the robots.txt url itself to check whether it exists
    r = URL().get_head_request(url=robots_url, user_agent=UserAgent.get_user_agent())
    if r is not None:
        if r.status_code == 200:
            robots_file_location = "projects/project-" + str(self.__project_id) + "/robots.txt"
            File.download_file(local_file_location=robots_file_location,
                               remote_file_location=robots_url)
        else:
            print("[-] NO robots.txt FOUND IN THE SERVER")
    self.__thread_semaphore.release()
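# Fetching robots.txt can be sketched with requests alone: send a HEAD request
# to the robots.txt URL itself and download the body only when it exists.
# download_robots() is a hypothetical helper, assuming a writable local path.
from urllib.parse import urljoin

import requests


def download_robots(base_url, local_path):
    robots_url = urljoin(base_url, "/robots.txt")
    head = requests.head(robots_url, timeout=10, allow_redirects=True)
    if head.status_code != 200:
        print("[-] NO robots.txt FOUND IN THE SERVER")
        return False
    response = requests.get(robots_url, timeout=10)
    with open(local_path, "wb") as robots_file:
        robots_file.write(response.content)     # save the raw robots.txt body
    return True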