Пример #1
0
    def __init__(self):
        """Initialise the base Config and restore the persisted counter.

        ``self.count`` is read from the first line of ``count.txt`` when that
        file exists (path is relative to the current working directory);
        otherwise it starts at 0.

        Raises:
            ValueError: if the first line of ``count.txt`` is not an integer.
        """
        Config.__init__(self)

        self.UserAgentString = "UCI Inf141-CS121 crawler 24427400 59359881 33062456 62838370"

        # Spaces-only indentation: mixing tabs and spaces in one block is a
        # TabError on Python 3 and fragile on Python 2.
        if os.path.exists('count.txt'):
            # Named `count_file` so the Python 2 builtin `file` is not shadowed.
            with open('count.txt', 'r') as count_file:
                self.count = int(count_file.readline())
        else:
            self.count = 0

        self.PolitenessDelay = 1200
Пример #2
0
 def __init__(self):
     """Initialise the base Config, then override the crawler settings.

     Also opens the shelf "urlDataPersist.shelve" and keeps the handle on
     ``self.urlToNameMap``.
     """
     Config.__init__(self)

     # Crawler identity.
     self.UserAgentString = "lordnahor-libseek-MSR-app"

     # Worker pool size and traversal order.
     self.MaxWorkerThreads = 8
     self.DepthFirstTraversal = True

     # The frontier, worker and output-buffer timeouts all use the same value.
     shared_timeout = 100
     self.FrontierTimeOut = shared_timeout
     self.WorkerTimeOut = shared_timeout
     self.OutBufferTimeOut = shared_timeout

     self.PolitenessDelay = 1000

     # 5 * 2**20 == 5242880 (presumably a byte count -- confirm against Config).
     self.MaxPageSize = 5 * 1024 * 1024

     self.IgnoreRobotRule = True

     # Shelf stays open after construction; it is not closed in this method.
     self.urlToNameMap = shelve.open("urlDataPersist.shelve")
Пример #3
0
    def __init__(self):
        """Initialise the base Config and wire up the Postgres-backed store.

        Reads the whole of ``db.conf`` (relative to the current working
        directory) into ``self.dbConf``, opens a database connection via
        ``self.connectDatabase()``, and builds ``self.PersistenceObject`` as a
        ``NetShelve.PgShelve``.

        Raises:
            IOError: if ``db.conf`` cannot be opened or read.
        """
        Config.__init__(self)
        self.UserAgentString = "UCI Inf141-CS121 crawler 63393716 32393047 22863530 82181685"
        self.PolitenessDelay = 600

        # Timeout (seconds) for trying to get the next url from the frontier.
        self.FrontierTimeOut = 60

        # Timeout (seconds) for trying to get a free worker thread
        # (the worker may be taking too long).
        self.WorkerTimeOut = 60

        # Timeout (seconds) for getting data from the output queue.
        self.OutBufferTimeOut = 60

        self.MaxQueueSize = 100

        self.urlValidator = UrlValidator(verbose=False)

        # Read the config through a context manager so the file handle is
        # closed immediately instead of being left to the garbage collector.
        with open('db.conf') as conf_file:
            self.dbConf = conf_file.read()

        self.conn = self.connectDatabase()

        # Parenthesised form prints the same text on Python 2 and is also
        # valid Python 3 syntax.
        print("Using Postgres shelve implementation...")

        # NOTE(review): this calls connectDatabase() a second time rather than
        # reusing self.conn -- confirm that a separate connection for the
        # shelve is intentional.
        self.PersistenceObject = NetShelve.PgShelve(self.connectDatabase())
Пример #4
0
 def __init__(self):
     """Run the base Config initialiser and assign the user-agent string."""
     Config.__init__(self)

     # NOTE(review): the literal reads like a placeholder -- confirm a real
     # value is configured before this is used.
     placeholder_agent = "Set This Value!"
     self.UserAgentString = placeholder_agent
Пример #5
0
 def __init__(self):
     """Initialise the base Config and set this crawler's user-agent string."""
     Config.__init__(self)

     agent_name = "IR W16 WebCrawler 85686586 42686317 79403075"
     self.UserAgentString = agent_name
Пример #6
0
 def __init__(self):
     """Initialise the base Config, then set the user agent and thread count."""
     Config.__init__(self)

     agent_name = "IR W16 WebCrawler 75307532_92707006_48565650"
     worker_count = 12

     self.UserAgentString = agent_name
     self.MaxWorkerThreads = worker_count
 def __init__(self):
     """Initialise the base Config and set a browser-style user-agent string."""
     Config.__init__(self)

     # Adjacent literals are joined at compile time into one string.
     browser_agent = (
         "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) "
         "Chrome/41.0.2228.0 Safari/537.36"
     )
     self.UserAgentString = browser_agent
Пример #8
0
 def __init__(self):
     """Run the base Config initialiser and assign the user-agent string."""
     Config.__init__(self)

     # NOTE(review): this value looks like an unfilled template default --
     # verify it is replaced before use.
     default_agent = "Set This Value!"
     self.UserAgentString = default_agent
Пример #9
0
 def __init__(self):
     """Initialise the base Config and set this crawler's user-agent string."""
     Config.__init__(self)

     crawler_agent = "UCI Inf141-CS121 crawler 33819914"
     self.UserAgentString = crawler_agent
Пример #10
0
 def __init__(self):
     """Initialise the base Config, then set the user agent and thread count."""
     Config.__init__(self)

     # Values assigned on top of whatever the base initialiser set up.
     self.UserAgentString, self.MaxWorkerThreads = (
         "IR W16 WebCrawler 75307532_92707006_48565650",
         12,
     )
Пример #11
0
 def __init__(self):
     """Initialise the base Config, then set politeness delay and user agent."""
     Config.__init__(self)

     delay = 1000
     self.PolitenessDelay = delay

     # Disabled override, kept for reference: self.MaxDepth = 4

     agent_name = "UCI Inf141-CS121 crawler 67995387 90117275 14971857"
     self.UserAgentString = agent_name
Пример #12
0
 def __init__(self):
     """Run the base Config initialiser and assign the user-agent string."""
     Config.__init__(self)

     # Only the user agent is overridden in this method.
     ua = "UCI Inf141-CS121 crawler 33819914"
     self.UserAgentString = ua
Пример #13
0
 def __init__(self):
     """Initialise the base Config and set this crawler's user-agent string."""
     Config.__init__(self)

     # The angle brackets are part of the literal value.
     agent_name = "INF141 <28859606 76439804 50233729 71903006>"
     self.UserAgentString = agent_name
Пример #14
0
 def __init__(self):
     """Initialise the base Config, then set user agent, log names and storage.

     ``self.content`` is a ``defaultdict`` whose missing keys are created as
     empty dicts.
     """
     Config.__init__(self)

     self.UserAgentString = "UCI Inf141-CS121 crawler ratkins1"

     # File names only; nothing is opened in this method.
     self.log, self.contentLog = "log.txt", "content.txt"

     self.content = defaultdict(dict)