def add_links_to_queue(links):
    for url in links:
        if (url in Spider.queue) or (url in Spider.crawled):
            continue
        if Spider.domain_name != get_domain_name(url):
            continue
        Spider.queue.add(url)
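# add_links_to_queue relies on a get_domain_name helper that is not shown here.
# A minimal sketch of what it could look like, using only urllib.parse from the
# standard library; this is an illustration, not the project's own implementation.
from urllib.parse import urlparse

def get_domain_name(url):
    try:
        # netloc is e.g. "sub.example.com"; keep the last two labels as the domain
        netloc = urlparse(url).netloc
        return '.'.join(netloc.split('.')[-2:])
    except Exception:
        return ''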
def gather_info(name, url):
    print("Scanning " + url + '\n')
    print("Getting Domain Name...")
    domain_name = get_domain_name(url)
    print("Done\n")
    print("Getting the IP Address...")
    ip_address = get_ip_address(domain_name)
    print("Done\n")
    print("Running Nmap Scan...")
    nmap = get_nmap('-F', ip_address)
    print("Done\n")
    print("Fetching robots.txt...")
    robots_txt = get_robots_txt(url)
    print("Done\n")
    print("Extracting whois details...")
    whois = get_whois(domain_name)
    print("Done\n")
    # Create Final Report in a file
    create_report(name, url, domain_name, nmap, robots_txt, whois)
    print("Information for " + name + " saved in Websites/" + name + " Folder\n")
def check_url_type(url):
    if Spider.sub_domain_name not in url:
        if "amazon.com" in domain.get_domain_name(url):
            return 'all-amazon'
        else:
            return 'non-amazon'
    return 'dev-amazon'
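# Hypothetical use of check_url_type when filtering gathered links; the helper
# name and the policy of keeping only Amazon URLs are assumptions for illustration.
def filter_amazon_links(links):
    kept = set()
    for url in links:
        if check_url_type(url) in ('dev-amazon', 'all-amazon'):
            kept.add(url)
    return kept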
def gather_info(name, url):
    domain_name = get_domain_name(url)
    ip_address = get_ip_address(domain_name)
    nmap = get_nmap('-F', ip_address)
    robots_txt = get_robots_txt(url)
    whois = get_whois(domain_name)
    print("Done\n")
    create_report(name, nmap, robots_txt, whois)
    print("Information for " + name + " saved in Websites/" + name + " Folder\n")
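# gather_info depends on several helpers defined elsewhere in these projects.
# The sketches below are assumptions about what they might do, using only the
# standard library plus the external whois/nmap command-line tools (which must
# be installed); they are illustrations, not the original implementations.
import os
import socket
import subprocess
from urllib.request import urlopen


def get_ip_address(domain_name):
    # Resolve the domain to an IPv4 address
    return socket.gethostbyname(domain_name)


def get_robots_txt(url):
    # Fetch robots.txt from the site root (assumes url points at the homepage)
    if not url.endswith('/'):
        url = url + '/'
    return urlopen(url + 'robots.txt').read().decode('utf-8', errors='replace')


def get_whois(domain_name):
    # Shell out to the whois CLI (assumed to be installed)
    return subprocess.run(['whois', domain_name],
                          capture_output=True, text=True).stdout


def get_nmap(options, ip_address):
    # Shell out to nmap, e.g. a fast scan with '-F' (assumed to be installed)
    return subprocess.run(['nmap', options, ip_address],
                          capture_output=True, text=True).stdout


def create_report(name, nmap, robots_txt, whois):
    # Write the collected information into Websites/<name>/report.txt
    folder = os.path.join('Websites', name)
    os.makedirs(folder, exist_ok=True)
    with open(os.path.join(folder, 'report.txt'), 'w') as report:
        report.write('NMAP\n' + nmap + '\n')
        report.write('ROBOTS.TXT\n' + robots_txt + '\n')
        report.write('WHOIS\n' + whois + '\n')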
import threading
from queue import Queue
from spider import Spider
from domain import get_domain_name
from general import file_to_set

PROJECT_NAME = "elective_dummy"
HOMEPAGE = "https://clbokea.github.io/exam/index.html"
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + "/queue.txt"
NUMBER_OF_THREADS = 1
thread_queue = Queue()
Spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


# Create worker threads (dies when main exits)
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        thread = threading.Thread(target=work)
        thread.daemon = True
        thread.start()


# Do the next job in queue
def work():
    while True:
        url = thread_queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        thread_queue.task_done()
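# The excerpt above defines the workers but not the code that feeds them. In
# crawlers structured like this, the driver typically looks like the following
# sketch; it is an assumption built from the names already defined here
# (file_to_set, QUEUE_FILE, thread_queue) and it presumes that Spider.crawl_page
# rewrites QUEUE_FILE on disk as pages are crawled.
def create_jobs():
    for link in file_to_set(QUEUE_FILE):
        thread_queue.put(link)
    thread_queue.join()
    crawl()


# Check if there are items in the queue; if so, crawl them
def crawl():
    queued_links = file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        print(str(len(queued_links)) + ' links in the queue')
        create_jobs()


create_workers()
crawl()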
import threading
from queue import Queue
from domain import get_domain_name
from spider import Spider

NUM_SPIDERS = 10
HOMEPAGE = 'https://twitter.com/'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
Spider(DOMAIN_NAME, HOMEPAGE)
q = Queue()


# crawl the next url
def work():
    while True:
        url = q.get()
        Spider.crawl_page(threading.current_thread().name, url)
        q.task_done()


# Create spider threads (will be terminated when main exits)
def create_spiders():
    for _ in range(NUM_SPIDERS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Each queued link is a new job
def create_jobs():
#""" #Created on Tue Aug 7 15:39:30 2018 #Web Crawler Program #@author: Haik Shougarian #"" import threading from Functions import file_to_set from queue import Queue from Spider import spider from domain import get_domain_name #Each itteration of the program is a new project PROJECT_NAME = 'Web Crawler' #Naming the current project HOMEPAGE ='https://www.reuters.com/' #Gives starting page DOMAIN_NAME = get_domain_name(HOMEPAGE) #Function is called that gets domain name QUEUE_FILE = PROJECT_NAME + '/queue.txt' CRAWLED_FILE = PROJECT_NAME + '/crawled.txt' NUMBER_OF_THREADS = 8 queue = Queue() spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME) # First spider is called # creating worker threads def create_workers(): for _ in range(NUMBER_OF_THREADS): #itterates as many times as there are threads t = threading.Thread(target=work) t.daemon = True t.start()