def insert_url(self, url):
    """Store a seed URL in the data collection and queue its outlinks.

    A depth-0 ``Page`` is built for ``url``, processed, and its
    ``set_data()`` document is inserted into ``self.data``.  If that insert
    raises ``DuplicateKeyError`` the existing document gets the page's
    ``add_data()`` pushed onto it, unless the page's last date equals
    ``self.date`` once both have their hour set to 0.  When
    ``self.task["repeat"]`` is truthy, ``crawl_nb`` is incremented as well.

    When the processed page's ``status`` is truthy, each of its outlinks is
    inserted into ``self.queue``; outlinks already queued are skipped.

    Args:
        url: address of the seed page.

    Returns:
        The ``self.queue`` collection.
    """
    # NOTE(review): "source_url" is the literal string "url", not the url
    # argument -- confirm this marker is intentional.
    seed = Page({"url": url, "source_url": "url", "depth": 0}, self.task)
    seed.process(False)

    try:
        self.data.insert_one(seed.set_data())
    except pymongo.errors.DuplicateKeyError:
        # NOTE(review): only the hour is reset before comparing, so minutes
        # and seconds still take part in the equality -- confirm intended.
        today = self.date.replace(hour=0)
        last_seen = seed.date[-1].replace(hour=0)
        if last_seen != today:
            self.data.update_one(
                {"url": url, "depth": 0},
                {"$push": seed.add_data()},
            )
            if self.task["repeat"]:
                self.data.update_one({"url": url}, {"$inc": {"crawl_nb": 1}})
        else:
            print("Already in processing queue today. No need to update then!")

    if not seed.status:
        return self.queue

    for outlink in seed.outlinks:
        try:
            self.queue.insert_one(outlink)
        except pymongo.errors.DuplicateKeyError:
            # Already queued: nothing to do for this outlink.
            pass
        except pymongo.errors.WriteError:
            print("Error %s" % (outlink,))
    return self.queue
def global_crawl(self):
    """Process queued URLs until the queue collection is empty.

    Each pass walks the queue ordered by ascending depth.  Every item is
    wrapped in a ``Page``, processed, and its ``set_data()`` document is
    inserted into ``self.data``.  When the page's ``status`` is truthy the
    document is tagged ``"page"`` and, while
    ``page.depth + 1 < page.max_depth``, outlinks whose URL is not yet in
    ``self.data`` are added to the queue; otherwise the document is tagged
    ``"log"``.  The page's ``add_data()`` is then pushed onto the document.
    The item is removed from the queue afterwards, including when a
    ``DuplicateKeyError`` was raised while storing it.

    A report is sent through ``self.report`` at the start of every pass and
    once more after the queue has been drained.

    NOTE(review): the nesting of the status/depth branches was reconstructed
    from a flattened source -- confirm against the intended behaviour.

    Returns:
        bool: always ``True``.
    """
    logger.debug("***************CRAWL********")
    while self.queue.count() > 0:
        print("%i urls in process" % self.queue.count())
        print("in which %i sources in process" % self.queue.count({"depth": 0}))
        self.report.report("mail")

        cursor = self.queue.find(no_cursor_timeout=True).sort(
            [("depth", pymongo.ASCENDING)]
        )
        # A cursor opened with no_cursor_timeout is never reaped by the
        # server, so it must be closed even if processing a page raises.
        try:
            for item in cursor:
                print("%i urls in process" % self.queue.count())
                page = Page(item, self.task)
                # Relevance check; the outcome is read from page.status.
                page.process()
                url_filter = {"url": item["url"]}
                try:
                    # Create and insert the page document.
                    self.data.insert_one(page.set_data())
                    if page.status:
                        added = 0
                        if page.depth + 1 < page.max_depth:
                            # self.data is not modified inside the loop
                            # below, so one lookup per page is enough.
                            known_urls = set(self.data.distinct("url"))
                            for outlink in page.outlinks:
                                if outlink["url"] in known_urls:
                                    continue
                                try:
                                    self.queue.insert_one(outlink)
                                except pymongo.errors.DuplicateKeyError:
                                    continue
                                # Count only outlinks that really went in.
                                added += 1
                        print(
                            "adding %i new urls in queue with depth %i"
                            % (added, page.depth + 1)
                        )
                        self.data.update_one(
                            url_filter, {"$set": {"type": "page"}}
                        )
                    else:
                        self.data.update_one(
                            url_filter, {"$set": {"type": "log"}}
                        )
                    self.data.update_one(
                        url_filter, {"$push": page.add_data()}
                    )
                except pymongo.errors.DuplicateKeyError:
                    # Page already stored: it only has to leave the queue.
                    pass
                self.queue.delete_one(url_filter)
        finally:
            cursor.close()
        # The former ``s.report("mail")`` call was dropped here: ``s`` is
        # never assigned in this method, and self.report already reports at
        # the start of every pass and after the loop.

    logger.debug("***************END********")
    self.report.report("mail")
    return True