def __check_programming_language(self, url):
    """
    Description:
    ============
    Tries to determine the server-side programming language used to build the
    website by looking for well-known file extensions in the URLs linked from
    the given page.

    Notes:
    ======
    This method relies heavily on the URL class from the url package.

    :param url: The URL of the page whose links will be inspected
    :return: None
    """
    self.__thread_semaphore.acquire()
    print("[+] ANALYSING PROGRAMMING LANGUAGE")
    # File extensions of the popular languages used for building websites
    language_names = {
        ".php": "PHP",
        ".jsp": "JSP",
        ".asp": "ASP",
        ".aspx": "ASPX",
        ".py": "PYTHON",
        ".pl": "PERL"
    }
    user_agent = UserAgent.get_user_agent()
    r = URL().get_request(url=url, user_agent=user_agent)
    if r is not None:
        soup = BeautifulSoup(r.content, "html.parser")
        for anchor in soup.find_all("a"):
            try:
                partial_url = anchor.get("href")
                if "http" not in partial_url:
                    # Relative link: resolve it against the page URL
                    new_url = URL.join_urls(url, partial_url)
                else:
                    # Absolute link: keep it only if it stays on the same domain
                    new_url = partial_url if URL.is_same_domain(url, partial_url) else ""
                file_name = URL.get_file_name(new_url)
                language_found = False
                for extension in language_names:
                    if extension in file_name:
                        self.__programming_language_used = language_names[extension]
                        # Record the detected programming language in the database
                        InfoGatheringPhaseOneDatabase.update_programming_language(
                            self.__database_semaphore,
                            self.__connection,
                            self.__project_id,
                            self.__programming_language_used)
                        language_found = True
                        break
                # Stop inspecting links once the language has been identified
                if language_found:
                    break
            except Exception:
                pass
    self.__thread_semaphore.release()
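# Hedged illustration (not part of the original class): the lookup above assumes
# URL.get_file_name() returns the file component of a URL path, e.g. "index.php"
# for "http://example.com/index.php". A minimal sketch of the same extension-based
# detection using only the standard library might look like this:
def _example_detect_language(url):
    """Guess the server-side language of a URL from its file extension."""
    from urllib.parse import urlparse
    import posixpath
    language_names = {".php": "PHP", ".jsp": "JSP", ".asp": "ASP",
                      ".aspx": "ASPX", ".py": "PYTHON", ".pl": "PERL"}
    file_name = posixpath.basename(urlparse(url).path)  # e.g. "index.php"
    for extension, language in language_names.items():
        if file_name.endswith(extension):
            return language
    return None

# Example usage: _example_detect_language("http://example.com/login.php") -> "PHP"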
def crawl(self, url):
    """
    Description:
    ------------
    Recursively crawls every in-scope URL reachable from the given URL.

    :param url: The URL to be crawled
    :return: None
    """
    # Time the request so the response time can be passed to the predictor
    start_time = time.time()
    r = URL().get_request(url=url, user_agent=UserAgent.get_user_agent())
    end_time = time.time()
    total_time = end_time - start_time
    self.__bob_object.predict(total_time)
    if r is not None:
        soup = BeautifulSoup(r.content, "html.parser")
        # At this stage we have the BeautifulSoup object; collect all href links
        for anchor in soup.find_all("a"):
            try:
                partial_url = anchor.get("href")
                url_to_be_scanned = None  # the URL that will be scanned, if any
                # Check whether the href is an absolute URL or a partial one
                if "http" in partial_url:
                    # Absolute URL: only queue it if it belongs to the same domain
                    if URL.is_same_domain(self.__base_url, partial_url):
                        if partial_url not in self.__crawled_urls:
                            self.__urls.put(partial_url)
                            self.__crawled_urls.append(partial_url)
                            url_to_be_scanned = partial_url
                else:
                    # Partial URL: join it with the base URL before queueing
                    full_url = URL.join_urls(self.__base_url, partial_url)
                    if full_url not in self.__crawled_urls:
                        self.__urls.put(full_url)
                        self.__crawled_urls.append(full_url)
                        url_to_be_scanned = full_url
                # Run a simple scan on the newly discovered URL
                if url_to_be_scanned is not None:
                    print("[i] CURRENTLY SCANNING [GET]: ", url_to_be_scanned)
                    # Launch the scan as a new process via SimpleScan
                    SimpleScan(
                        project_id=self.__project_id,
                        thread_semaphore=self.__thread_semaphore,
                        database_semaphore=self.__database_semaphore,
                        url=url_to_be_scanned,
                        connection=self.__connection,
                        poc_object=self.__poc_object)
            except Exception as e:
                print("[-] EXCEPTION OCCURRED ", e)
    # Continue crawling the URLs discovered above
    while not self.__urls.empty():
        self.crawl(self.__urls.get())
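# Hedged illustration (assumption, not the project's URL class): crawl() relies on
# URL.join_urls() and URL.is_same_domain() to normalise hrefs and keep the crawl
# in scope. Minimal stand-ins using only the standard library could behave like this:
def _example_join_urls(base_url, partial_url):
    """Resolve a relative href against the base URL, e.g. "about.php" -> full URL."""
    from urllib.parse import urljoin
    return urljoin(base_url, partial_url)

def _example_is_same_domain(base_url, other_url):
    """Return True when both URLs share the same network location (host:port)."""
    from urllib.parse import urlparse
    return urlparse(base_url).netloc == urlparse(other_url).netloc

# Example usage:
#   _example_join_urls("http://example.com/", "about.php") -> "http://example.com/about.php"
#   _example_is_same_domain("http://example.com/", "http://example.com/login") -> True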