Example #1
def add_links_to_queue(links):
    for url in links:
        if (url in Spider.queue) or (url in Spider.crawled):
            continue
        if Spider.domain_name != get_domain_name(url):
            continue
        Spider.queue.add(url)
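
Every one of these examples leans on a get_domain_name() helper that the listings do not show. A minimal sketch, assuming a simple last-two-labels heuristic (it will not handle multi-part suffixes such as .co.uk):

from urllib.parse import urlparse

# Hypothetical sketch of the get_domain_name() helper assumed above
def get_domain_name(url):
    try:
        netloc = urlparse(url).netloc              # e.g. 'www.example.com'
        return '.'.join(netloc.split('.')[-2:])    # e.g. 'example.com'
    except Exception:
        return ''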
Example #2
def gather_info(name, url):
    print("Scanning " + url + '\n')

    print("Getting Domain Name...")
    domain_name = get_domain_name(url)
    print("Done\n")

    print("Getting the IP Address...")
    ip_address = get_ip_address(domain_name)
    print("Done\n")

    print("Running Nmap Scan...")
    nmap = get_nmap('-F', ip_address)
    print("Done\n")

    print("Fetching robots.txt...")
    robots_txt = get_robots_txt(url)
    print("Done\n")

    print("Extracting whois details...")
    whois = get_whois(domain_name)
    print("Done\n")

    # Create Final Report in a file
    create_report(name, url, domain_name, nmap, robots_txt, whois)
    print("Information for " + name + " saved in Websites/" + name +
          " Folder\n")
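
gather_info() delegates its work to helpers defined elsewhere in the project. The lookups might look roughly like the sketches below; these are assumptions, not the original implementations, and the nmap/whois versions assume the corresponding command-line tools are installed and on the PATH.

import socket
import subprocess
from urllib.request import urlopen

# Hypothetical sketch: resolve a domain name to an IPv4 address
def get_ip_address(domain_name):
    return socket.gethostbyname(domain_name)

# Hypothetical sketch: fetch <url>robots.txt and return it as text
def get_robots_txt(url):
    if not url.endswith('/'):
        url += '/'
    with urlopen(url + 'robots.txt') as response:
        return response.read().decode('utf-8', errors='replace')

# Hypothetical sketch: run 'nmap <options> <ip>' and return its output,
# e.g. get_nmap('-F', ip_address) performs a fast port scan
def get_nmap(options, ip_address):
    result = subprocess.run(['nmap', options, ip_address],
                            capture_output=True, text=True)
    return result.stdout

# Hypothetical sketch: run 'whois <domain>' and return its output
def get_whois(domain_name):
    result = subprocess.run(['whois', domain_name],
                            capture_output=True, text=True)
    return result.stdout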
Example #4
def check_url_type(url):
    if Spider.sub_domain_name not in url:
        if "amazon.com" in domain.get_domain_name(url):
            return 'all-amazon'
        else:
            return 'non-amazon'
    return 'dev-amazon'
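
For illustration, assuming Spider.sub_domain_name is set to a development host such as 'dev.amazon.com' and get_domain_name() behaves like the sketch under Example #1, the classifier would return (hypothetical values):

check_url_type('https://dev.amazon.com/internal')    # -> 'dev-amazon'
check_url_type('https://www.amazon.com/dp/B000001')  # -> 'all-amazon'
check_url_type('https://example.org/page')           # -> 'non-amazon'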
Example #5
def gather_info(name, url):

    domain_name = get_domain_name(url)
    ip_address = get_ip_address(domain_name)
    nmap = get_nmap('-F', ip_address)
    robots_txt = get_robots_txt(url)
    whois = get_whois(domain_name)
    print("Done\n")

    create_report(name, nmap, robots_txt, whois)
    print("Information for " + name + " saved in Websites/" + name +
          " Folder\n")
Example #6
import threading
from queue import Queue
from spider import Spider
from domain import get_domain_name
from general import file_to_set

PROJECT_NAME = "elective_dummy"
HOMEPAGE = "https://clbokea.github.io/exam/index.html"
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + "/queue.txt"
NUMBER_OF_THREADS = 1
thread_queue = Queue()
Spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


# Create worker threads (they die when main exits)
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        thread = threading.Thread(target=work)
        thread.daemon = True
        thread.start()


# Do the next job in queue
def work():
    while True:
        url = thread_queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        thread_queue.task_done()
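
The listing stops after work(). A typical completion of such a main module (an assumption, not shown in the source) reads QUEUE_FILE with file_to_set(), hands every link to the worker threads, and repeats until the queue file is empty:

# Put every queued link on the thread queue as a job
def create_jobs():
    for link in file_to_set(QUEUE_FILE):
        thread_queue.put(link)
    thread_queue.join()
    crawl()


# Check the queue file for new links and schedule them
def crawl():
    queued_links = file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        print(str(len(queued_links)) + ' links in the queue')
        create_jobs()


create_workers()
crawl()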

Example #7
import threading
from queue import Queue
from domain import get_domain_name
from spider import Spider


NUM_SPIDERS = 10
HOMEPAGE = 'https://twitter.com/'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
Spider(DOMAIN_NAME, HOMEPAGE)
q = Queue()


# crawl the next url
def work():
    while True:
        url = q.get()
        Spider.crawl_page(threading.current_thread().name, url)
        q.task_done()


# Create spider threads (will be terminated when main exits)
def create_spiders():
    for x in range(NUM_SPIDERS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Each queued link is a new job
def create_jobs():
    # Body truncated in the original listing; a typical implementation
    # (assumption) feeds every link the spider has queued into the work queue
    for link in Spider.queue.copy():
        q.put(link)
    q.join()
Example #8
"""
Created on Tue Aug  7 15:39:30 2018

Web Crawler Program
@author: Haik Shougarian
"""
import threading
from Functions import file_to_set
from queue import Queue
from Spider import spider
from domain import get_domain_name

# Each iteration of the program is a new project
PROJECT_NAME = 'Web Crawler'  # Name of the current project
HOMEPAGE = 'https://www.reuters.com/'  # Starting page for the crawl
DOMAIN_NAME = get_domain_name(HOMEPAGE)  # Domain name of the homepage
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 8

queue = Queue()
spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)  # The first spider crawls the homepage


# Create worker threads
def create_workers():
    for _ in range(NUMBER_OF_THREADS):  # one thread per configured worker
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()
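
create_workers() targets a work() function that is not part of this listing. A minimal sketch, mirroring Examples #6 and #7 and assuming the spider class exposes a crawl_page() class method:

# Pull the next URL off the queue and crawl it
def work():
    while True:
        url = queue.get()
        spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()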