Example #1
0
 def __preprocess(self):
     """
     Rewrite the downloaded robots.txt into a pre-processed file.

     Comment lines are stored as raw info in the database; "User-agent"
     lines are copied through unchanged; "Disallow"/"Allow" lines have
     their relative paths expanded to absolute URLs before being written
     to ``self.__robots_preprocessed_file``. Blank lines are preserved.
     """
     print("[+] PRE-PROCESSING ROBOTS.TXT")
     self.__thread_semaphore.acquire(timeout=10)
     try:
         # Close the file deterministically instead of leaking the handle.
         with open(self.__robots_path, "r") as robots_file:
             robots_file_contents = robots_file.readlines()
         for content in robots_file_contents:
             content = content.strip()
             if not content:
                 # Original relied on content[0] raising IndexError here.
                 self.__robots_preprocessed_file.write("\n")
                 continue
             if content[0] == "#":
                 # Comment line: archive it as raw information.
                 SivaDB.update_raw_info(connection=self.__connection, project_id=self.__project_id,
                                        info_source="robots.txt", information=content,
                                        database_semaphore=self.__database_semaphore)
             elif content[0] == "U":
                 # "User-agent" line: copy through unchanged.
                 self.__robots_preprocessed_file.write(content + "\n")
             else:
                 full_content = ""  # Minor fix @v0.2
                 if content[0] == "D":
                     # "Disallow" line: expand the path to an absolute URL.
                     path = content.replace("Disallow:", "").strip()
                     full_content = "Disallow: " + URL.join_urls(self.__url, path)
                 elif content[0] == "A":
                     # Fix: was `"U" in content[0]`, which is unreachable
                     # (the "U" case is handled above) — "Allow" starts with "A".
                     path = content.replace("Allow:", "").strip()
                     full_content = "Allow: " + URL.join_urls(self.__url, path)
                 self.__robots_preprocessed_file.write(full_content + "\n")
     finally:
         # Release even if parsing raises, so other workers are not starved.
         self.__thread_semaphore.release()
 def __init__(self, project_id, url, thread_semaphore, database_semaphore,
              connection):
     """
     Probe a website for administrator pages and record any found.

     :param project_id: The id of the project
     :param url: The website for which the administrator page is to be found
     :param thread_semaphore: limits the number of concurrent probe threads
     :param database_semaphore: serialises writes to the database
     :param connection: open database connection used for recording results
     """
     self.__project_id = project_id
     self.__url = url
     self.__thread_semaphore = thread_semaphore
     self.__database_semaphore = database_semaphore
     self.__connection = connection
     # Fix: both lists were read below but never initialised, so the
     # join loop and the result loop would raise AttributeError.
     # __admin_pages is presumably appended to by add_if_page_found —
     # TODO confirm against that method.
     self.__threads = []
     self.__admin_pages = []
     admin_contents = File.read_to_list("admin.txt")
     for admin_page in tqdm(admin_contents, ncols=100):
         self.__thread_semaphore.acquire()
         admin_url = URL.join_urls(self.__url, admin_page)
         t = Thread(target=self.add_if_page_found, args=(admin_url, ))
         # Fix: keep a reference to each worker so the join loop below
         # actually waits for it (threads were previously discarded).
         self.__threads.append(t)
         t.start()
     print("[+] WAITING FOR THE THREADS TO COMPLETE THEIR TASKS")
     for thread in self.__threads:
         if thread.is_alive():
             thread.join()
     # Now display and add the admin pages in database table named "admin_table"
     for admin_page in self.__admin_pages:
         print("[+] ADMIN PAGE: ", admin_page)
         self.update_admin_page(
             project_id=project_id,
             url=admin_page,
             connection=self.__connection,
             database_semaphore=self.__database_semaphore)
Example #3
0
 def brutforce(self):
     """
     Visit every keyword-derived URL with a headless PhantomJS browser,
     printing the URL each request lands on and the browser's HAR log.
     """
     self.__browser = webdriver.PhantomJS(Static.phantomjs)
     for keyword in self.__keywords:
         target = URL.join_urls(self.__website, keyword)
         self.__browser.get(target)
         # Show where the request ended up (follows redirects) and its HAR trace.
         print(self.__browser.current_url)
         print(self.__browser.get_log("har"))
     self.__browser.quit()
Example #4
0
 def __check_programming_language(self, url):
     """
     Description:
     ============
     This method will try its level best to get the name of the programming
     language used to build the website, by scanning the file extensions of
     every anchor href on the page against a table of well-known extensions.

     Notes:
     ======
     This method heavily uses the URL class from the url package.
     The first matching extension found is recorded in the database via
     InfoGatheringPhaseOneDatabase.update_programming_language.

     :param url: page whose links are inspected
     :return: None
     """
     self.__thread_semaphore.acquire()
     print("[+] ANALYSING PROGRAMMING LANGUAGE")
     # These are the popular programming languages used for designing websites
     language_names = {
         ".php": "PHP",
         ".jsp": "JSP",
         ".asp": "ASP",
         ".aspx": "ASPX",
         ".py": "PYTHON",
         ".pl": "PERL"
     }
     user_agent = UserAgent.get_user_agent()
     r = URL().get_request(url=url, user_agent=user_agent)
     if r is not None:
         soup = BeautifulSoup(r.content, "html.parser")
         for anchor in soup.find_all("a"):
             try:
                 partial_url = anchor.get("href")
                 if "http" not in partial_url:
                     # Relative link: resolve it against the page URL.
                     new_url = URL.join_urls(url, partial_url)
                 else:
                     # Fix: original called URL.is_same_domain(url, new_url)
                     # with new_url still unbound — a NameError silently
                     # swallowed by the except, so absolute links never worked.
                     new_url = partial_url if URL.is_same_domain(
                         url, partial_url) else ""
                 file_name = URL.get_file_name(new_url)
                 # Iterate extension/language pairs directly instead of
                 # re-indexing the dict (also avoids shadowing the anchor var).
                 for extension, language in language_names.items():
                     if extension in file_name:
                         self.__programming_language_used = language
                         # Now we will update the programming language used into the database
                         InfoGatheringPhaseOneDatabase.update_programming_language(
                             self.__database_semaphore, self.__connection,
                             self.__project_id,
                             self.__programming_language_used)
                         break
             except Exception:
                 # Best-effort scan: anchors with missing/odd hrefs are skipped.
                 pass
     self.__thread_semaphore.release()
Example #5
0
 def crawl(self, url):
     """
     Description:
     ------------
     This will crawl the urls completely: fetch *url*, queue every
     same-domain link found on it, run a SimpleScan on each newly seen
     link, then recursively crawl everything left in the queue.

     Side effects: appends to ``self.__crawled_urls``, puts/gets on
     ``self.__urls``, feeds response timing to ``self.__bob_object``.

     :param url: The url to be crawled
     :return: None
     """
     # Time the request so the response time can be fed to __bob_object
     # (presumably some anomaly/WAF detector — TODO confirm).
     start_time = time.time()
     r = URL().get_request(url=url, user_agent=UserAgent.get_user_agent())
     end_time = time.time()
     total_time = end_time - start_time
     self.__bob_object.predict(total_time)
     if r is not None:
         soup = BeautifulSoup(r.content, "html.parser")
         # At this stage we have got the beautiful soup objects
         #First find all the href links
         for i in soup.find_all("a"):
             try:
                 partial_url = i.get("href")
                 url_to_be_scanned = None  # we will scan this urls
                 # Check if the partial url is actually a partial url
                 if "http" in partial_url:
                     # Absolute link: only follow it if it stays on our domain
                     # and has not been seen before.
                     if URL.is_same_domain(self.__base_url, partial_url):
                         if partial_url not in self.__crawled_urls:
                             self.__urls.put(partial_url)
                             self.__crawled_urls.append(partial_url)
                             url_to_be_scanned = partial_url
                 else:
                     # Relative link: resolve against the base URL first.
                     full_url = URL.join_urls(self.__base_url, partial_url)
                     if full_url not in self.__crawled_urls:
                         self.__urls.put(full_url)
                         self.__crawled_urls.append(full_url)
                         url_to_be_scanned = full_url
                 # run a simple scan in the url
                 if url_to_be_scanned is not None:
                     print("[i] CURRENTLY SCANNING [GET]: ",
                           url_to_be_scanned)
                     # Make the scanning as a new process
                     SimpleScan(
                         project_id=self.__project_id,
                         thread_semaphore=self.__thread_semaphore,
                         database_semaphore=self.__database_semaphore,
                         url=url_to_be_scanned,
                         connection=self.__connection,
                         poc_object=self.__poc_object)
             except Exception as e:
                 # NOTE(review): broad catch — also hides a TypeError when an
                 # anchor has no href (partial_url is None). Kept as-is.
                 print("[-] EXCEPTION OCCURED ", e)
     # Drain the queue recursively; recursion depth grows with the number
     # of queued URLs still unvisited at each level.
     while not self.__urls.empty():
         self.crawl(self.__urls.get())
 def __get_robots(self):
     """
     Description:
     ------------
     This method is used to get the robots.txt file from the remote server.
     A HEAD request first checks that robots.txt exists; on a 200 the file
     is downloaded into the project directory.

     :return: None
     """
     self.__thread_semaphore.acquire()
     try:
         robots_url = URL.join_urls(self.__url, "/robots.txt")
         print("[+] GETTING ROBOTS.TXT AT ", robots_url)
         # Fix: the HEAD request was sent to self.__url (the site root),
         # so a 200 on the homepage wrongly implied robots.txt exists.
         # Probe the robots.txt URL itself.
         r = URL().get_head_request(url=robots_url,
                                    user_agent=UserAgent.get_user_agent())
         if r is not None:
             if r.status_code == 200:
                 robots_file_location = "projects/project-" + str(
                     self.__project_id) + "/robots.txt"
                 File.download_file(local_file_location=robots_file_location,
                                    remote_file_location=robots_url)
             else:
                 print("[-] NO robots.txt FOUND IN THE SERVER")
     finally:
         # Always release, even if the request or download raises.
         self.__thread_semaphore.release()