Exemplo n.º 1
0
    def __init__(self):
        """Initialise the configuration and restore the persisted counter.

        Runs the base ``Config`` initialiser, sets the user-agent string,
        loads ``self.count`` from the first line of ``count.txt`` (relative
        to the current working directory) when that file exists, and falls
        back to 0 otherwise. Finally sets ``PolitenessDelay``.

        Raises:
            ValueError: if ``count.txt`` exists but its first line is not
                an integer (for example, when the file is empty).
        """
        Config.__init__(self)

        self.UserAgentString = "UCI Inf141-CS121 crawler 24427400 59359881 33062456 62838370"
        # Indentation here is spaces-only: the previous mix of tabs and
        # spaces only parsed under Python 2's 8-column tab rule and is a
        # TabError under Python 3.
        if os.path.exists('count.txt'):
            # The handle is not named ``file`` so the builtin is not shadowed.
            with open('count.txt', 'r') as count_file:
                self.count = int(count_file.readline())
        else:
            self.count = 0
        self.PolitenessDelay = 1200
Exemplo n.º 2
0
 def __init__(self):
     """Apply this application's crawler settings on top of ``Config``.

     After the base initialiser runs, this sets the user-agent string,
     worker-thread count, traversal flag, the three time-out values,
     politeness delay, maximum page size and robots flag, and opens the
     shelve file ``urlDataPersist.shelve``.
     """
     Config.__init__(self)

     # The three time-outs share one value.
     shared_timeout = 100
     # 5 * 1048576, i.e. the same number the page-size limit always had.
     page_size_limit = 5 * 1048576

     self.UserAgentString = "lordnahor-libseek-MSR-app"
     self.MaxWorkerThreads = 8
     self.DepthFirstTraversal = True
     self.FrontierTimeOut = shared_timeout
     self.WorkerTimeOut = shared_timeout
     self.OutBufferTimeOut = shared_timeout
     self.PolitenessDelay = 1000
     self.MaxPageSize = page_size_limit
     self.IgnoreRobotRule = True

     # NOTE(review): the shelf is opened here and not closed in this
     # method; confirm that something else closes it.
     url_store = shelve.open("urlDataPersist.shelve")
     self.urlToNameMap = url_store
Exemplo n.º 3
0
    def __init__(self):
        """Initialise the crawler configuration and its database persistence.

        Runs the base ``Config`` initialiser, sets the user-agent string,
        politeness delay, time-outs and queue size, creates the URL
        validator, reads ``db.conf`` from the current working directory,
        opens the database connection and installs a ``NetShelve.PgShelve``
        persistence object.

        Raises:
            IOError: if ``db.conf`` cannot be opened or read.
        """
        Config.__init__(self)
        self.UserAgentString = "UCI Inf141-CS121 crawler 63393716 32393047 22863530 82181685"
        self.PolitenessDelay = 600

        # Timeout (seconds) for trying to get the next url from the frontier.
        self.FrontierTimeOut = 60

        # Timeout (seconds) for trying to get a free worker thread
        # (the worker may be taking too long).
        self.WorkerTimeOut = 60

        # Timeout (seconds) for getting data from the output queue.
        self.OutBufferTimeOut = 60

        self.MaxQueueSize = 100

        self.urlValidator = UrlValidator(verbose=False)

        # Read the database configuration through a context manager so the
        # file handle is closed deterministically instead of being leaked
        # until garbage collection.
        with open('db.conf') as conf_file:
            self.dbConf = conf_file.read()

        self.conn = self.connectDatabase()
        # Single-argument parenthesised form prints the same text under
        # Python 2 and also parses under Python 3.
        print("Using Postgres shelve implementation...")
        # NOTE(review): connectDatabase() is called a second time here
        # instead of reusing self.conn; kept as-is, confirm whether a
        # separate connection for the shelve is intended.
        self.PersistenceObject = NetShelve.PgShelve(self.connectDatabase())
Exemplo n.º 4
0
 def __init__(self):
     """Run the ``Config`` initialiser, then set the user-agent string."""
     Config.__init__(self)

     # NOTE(review): the literal reads like a placeholder that is meant to
     # be replaced with a real identifier; confirm before crawling.
     agent_text = "Set This Value!"
     self.UserAgentString = agent_text
Exemplo n.º 5
0
 def __init__(self):
     """Run the ``Config`` initialiser, then set the user-agent string."""
     Config.__init__(self)

     # String stored as this configuration's user agent.
     crawler_identity = "IR W16 WebCrawler 85686586 42686317 79403075"
     self.UserAgentString = crawler_identity
Exemplo n.º 6
0
 def __init__(self):
     """Run the ``Config`` initialiser, then set user agent and thread count."""
     Config.__init__(self)

     crawler_identity = "IR W16 WebCrawler 75307532_92707006_48565650"
     thread_count = 12

     # Applied in the same order as before: identity first, then threads.
     self.UserAgentString = crawler_identity
     self.MaxWorkerThreads = thread_count
 def __init__(self):
     """Run the ``Config`` initialiser, then set a browser-style user agent."""
     Config.__init__(self)

     # The value is a Chrome-on-Windows style user-agent string.
     browser_agent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
     self.UserAgentString = browser_agent
Exemplo n.º 8
0
 def __init__(self):
     """Initialise via ``Config`` and assign the user-agent string."""
     Config.__init__(self)

     # NOTE(review): "Set This Value!" looks like an unfilled placeholder;
     # confirm whether a real identifier should be supplied.
     pending_agent = "Set This Value!"
     self.UserAgentString = pending_agent
Exemplo n.º 9
0
 def __init__(self):
     """Initialise via ``Config`` and assign the user-agent string."""
     Config.__init__(self)

     # String stored as this configuration's user agent.
     agent_text = "UCI Inf141-CS121 crawler 33819914"
     self.UserAgentString = agent_text
Exemplo n.º 10
0
 def __init__(self):
     """Initialise via ``Config``; set the user agent and worker-thread count."""
     Config.__init__(self)

     agent_text = "IR W16 WebCrawler 75307532_92707006_48565650"
     worker_total = 12

     # Assignment order matches the original: user agent, then threads.
     self.UserAgentString = agent_text
     self.MaxWorkerThreads = worker_total
Exemplo n.º 11
0
 def __init__(self):
     """Initialise via ``Config``; set the politeness delay and user agent."""
     Config.__init__(self)

     delay_value = 1000
     agent_text = "UCI Inf141-CS121 crawler 67995387 90117275 14971857"

     self.PolitenessDelay = delay_value
     # Disabled override, left inactive exactly as before:
     #self.MaxDepth = 4
     self.UserAgentString = agent_text
Exemplo n.º 12
0
 def __init__(self):
     """Call the base ``Config`` initialiser and record the user agent."""
     Config.__init__(self)

     # Value assigned to UserAgentString for this configuration.
     crawler_identity = "UCI Inf141-CS121 crawler 33819914"
     self.UserAgentString = crawler_identity
Exemplo n.º 13
0
 def __init__(self):
     """Call the base ``Config`` initialiser and record the user agent."""
     Config.__init__(self)

     # The angle brackets are part of the stored string.
     crawler_identity = "INF141 <28859606 76439804 50233729 71903006>"
     self.UserAgentString = crawler_identity
Exemplo n.º 14
0
 def __init__(self):
     """Call the base ``Config`` initialiser and set up logging attributes.

     Sets the user-agent string, stores the two log file names
     (``log.txt`` and ``content.txt``) and creates an empty
     ``defaultdict`` whose missing keys default to a new ``dict``.
     """
     Config.__init__(self)

     crawler_identity = "UCI Inf141-CS121 crawler ratkins1"
     self.UserAgentString = crawler_identity

     # Only the names are stored here; no file is opened in this method.
     log_name, content_log_name = "log.txt", "content.txt"
     self.log = log_name
     self.contentLog = content_log_name

     # Missing keys map to a fresh dict on first access.
     content_store = defaultdict(dict)
     self.content = content_store