#HOMEPAGE = 'https://thenewboston.com/'
#HOMEPAGE = 'http://www.health.com/'
HOMEPAGE = 'http://www.hindustantimes.com/'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
#DOMAIN_NAME = 'health.com/food'
#DOMAIN_NAME = 'health.com'
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 100

# queue is the thread job queue
queue = Queue()
spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)

# Create worker threads (they die when main exits)
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        #print("check1")
        t.daemon = True  # daemon so the thread dies when main exits
        t.start()  # start() begins executing the target (work); the queue is empty at first, so the thread waits

# Do the next job in the queue
def work():
    while True:
        url = queue.get()
        spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
def __init__(self, urllist=[]):
    self.spider = spider()
    self.indexer = indexer()
    self.parser = parser()
    self.urllist = urllist

def start(self):
    if len(self.urllist) == 0:
        return False
    self.spider.addurllist(self.urllist)
    self.spider.setparser(self.parser)
    self.spider.setindexer(self.indexer)
    self.spider.run()
    return True

def cleanup(self):
    self.indexer.closedb()

if __name__ == "__main__":
    spider = spider()
    #spider.addurl('http://localhost:9080/setest/test.php')
    spider.addurl('http://hq.booksarefun.com/')
    parserobj = parser()
    indexobj = indexer()
    spider.setparser(parserobj)
    spider.setindexer(indexobj)
    spider.run()
    indexobj.closedb()
    print 'done!'
def __init__(self, urllist=[]):
    self.spider = spider()
    self.indexer = indexer()
    self.parser = parser()
    self.urllist = urllist
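The `urllist=[]` default is a mutable object that Python shares across calls; a minimal sketch of the usual safer variant (not from the source):

# Safer variant of the constructor above: avoid the shared mutable default list.
def __init__(self, urllist=None):
    self.spider = spider()
    self.indexer = indexer()
    self.parser = parser()
    self.urllist = urllist if urllist is not None else []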
#!/usr/bin/python
#coding=utf-8
from Spider import spider
from ActorName import ActorNameHelper
import sys

if __name__ == "__main__":
    anh = ActorNameHelper("xiami_music_artist.txt")
    ans = 1
    id, name = anh.getName()
    while name != None:
        print str(ans) + ':\t' + id + '\t' + name
        crawler = spider()
        crawler.run(name, id)
        id, name = anh.getName()
        ans += 1
        sys.stdout.flush()
    anh.close()
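The loop above relies on ActorNameHelper handing back (id, name) pairs until the list is exhausted; a minimal sketch of that interface, assuming a tab-separated id/name file (the real format of xiami_music_artist.txt is not shown):

# Hypothetical sketch of the ActorNameHelper interface used above.
# The "id<TAB>name" line format is an assumption, not taken from the source.
class ActorNameHelper:
    def __init__(self, path):
        self.f = open(path)

    def getName(self):
        line = self.f.readline()
        if not line:
            return None, None              # end of the artist list
        id, name = line.rstrip('\n').split('\t', 1)
        return id, name

    def close(self):
        self.f.close()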
from Spider import spider

if __name__ == "__main__":
    startURL = "http://baike.baidu.com/item/python"
    crawler = spider(startURL)
    crawler.crawl(size=10)
from Spider import spider
from Mongo import Database
from threading import Thread

linkCount = curCount = 0
i = 0
url = "https://www.w3schools.com/"
mongoData = Database("W3", linkCount, url)

while (i < mongoData.linksCount()):
    try:
        spiderLeg = spider(mongoData.getNext(curCount))
        curCount += 1
        spiderLeg.crawl()
        linkCount = mongoData.insertDB(spiderLeg.linkURI, spiderLeg.texts, spiderLeg.CurLink, spiderLeg.Meta, linkCount)
        print "Link ", i, " Done!!"
    except:
        print "Dropped!!"
    i += 1
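The loop depends on a Database helper exposing linksCount(), getNext() and insertDB(); a minimal pymongo-based sketch of that shape (collection and field names are assumptions, not the project's real schema):

# Hypothetical sketch of the Mongo-backed Database helper the loop calls.
# Only the method shapes mirror the usage above; names and fields are assumed.
from pymongo import MongoClient

class Database:
    def __init__(self, dbName, linkCount, startUrl):
        self.db = MongoClient()[dbName]
        if self.db.queue.count_documents({}) == 0:
            self.db.queue.insert_one({'idx': linkCount, 'url': startUrl})  # seed the queue

    def linksCount(self):
        return self.db.queue.count_documents({})

    def getNext(self, idx):
        doc = self.db.queue.find_one({'idx': idx})
        return doc['url'] if doc else None

    def insertDB(self, links, texts, curLink, meta, linkCount):
        for link in links:
            linkCount += 1
            self.db.queue.insert_one({'idx': linkCount, 'url': link})
        self.db.pages.insert_one({'url': curLink, 'text': texts, 'meta': meta})
        return linkCount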
def crawl():
    spider(urls=[
        "http://www.shopbop.com",
        "http://www.kilimall.co.ke/",
        "http://www.jumia.co.ke"
    ])
import threading
from queue import Queue
from Spider import spider
from domain import *
from source import *

Project_Name = ''    # Name of the directory (input)
home_page = ''       # Home page of the site you want to crawl (input)
domain_name = get_domain(home_page)
queue_file = Project_Name + '/queue.txt'
crawled_file = Project_Name + '/crawled.txt'
number_of_threads = 8

thread_queue = Queue()
spider(home_page, Project_Name, domain_name)

def create_workers():
    for x in range(number_of_threads):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()

def work():
    while True:
        url = thread_queue.get()
        spider.crawl_page(threading.current_thread().name, url)
        thread_queue.task_done()

def create_jobs():
    for link in file_to_set(queue_file):
        thread_queue.put(link)
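The snippet defines the workers and job creation but not the loop that drives them; a sketch of the usual wiring, assuming the spider keeps queue.txt up to date and that file_to_set comes from the starred imports:

# Hypothetical driver for the functions above: block until the current batch
# of queued URLs is crawled, then re-read queue.txt for newly found links.
def crawl():
    queued_links = file_to_set(queue_file)
    if len(queued_links) > 0:
        print(str(len(queued_links)) + ' links in the queue')
        create_jobs()
        thread_queue.join()   # wait for every queued URL to be processed
        crawl()               # repeat until queue.txt comes back empty

create_workers()
crawl()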
from Spider import spider
from Database import *
from threading import Thread

"""
Developed by: Prateek Jha, 15 May 2017
"""

linkCount = curCount = 0
i = 0
url = "***************"
initialize(linkCount, url)

while (i < 5):
    spiderLeg = spider(getNext(curCount))
    curCount += 1
    spiderLeg.crawl()
    linkCount = insertDB(spiderLeg.linknText, spiderLeg.headings, url, linkCount)
    print "Test Completed Successfully!!"
    i += 1
import threading
from Functions import file_to_set
from queue import Queue
from Spider import spider
from domain import get_domain_name

# Each iteration of the program is a new project
PROJECT_NAME = 'Web Crawler'               # name of the current project
HOMEPAGE = 'https://www.reuters.com/'      # starting page
DOMAIN_NAME = get_domain_name(HOMEPAGE)    # derive the domain name from the homepage
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 8

queue = Queue()
spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)  # the first spider is created here

# Create the worker threads
def create_workers():
    for _ in range(NUMBER_OF_THREADS):  # iterates once per thread
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()

# Do the next job in the queue
def work():
    while True:
        url = queue.get()
        spider.crawl_page(threading.current_thread().name, url)  # crawl the page in the current thread
        queue.task_done()
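get_domain_name is imported from the project's domain module but not shown; a minimal standard-library sketch of what it typically does (an assumption, not the module's actual code):

# Hypothetical sketch of get_domain_name using only the standard library.
from urllib.parse import urlparse

def get_domain_name(url):
    try:
        netloc = urlparse(url).netloc              # e.g. 'www.reuters.com'
        return '.'.join(netloc.split('.')[-2:])    # e.g. 'reuters.com'
    except Exception:
        return ''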
def __init__(self, urllist=[]):
    self.spider = spider()
    self.indexer = indexer()
    self.parser = parser()
    self.urllist = urllist