def create(cls, key, label, user, description=None, locale=None):
    from group import Group
    from membership import Membership
    from page import Page

    instance = Instance(unicode(key).lower(), label, user)
    instance.description = description
    instance.default_group = Group.by_code(Group.INSTANCE_DEFAULT)
    if locale is not None:
        instance.locale = locale
    meta.Session.add(instance)
    supervisor_group = Group.by_code(Group.CODE_SUPERVISOR)
    membership = Membership(user, instance, supervisor_group,
                            approved=True)
    meta.Session.add(membership)
    if config.get_bool('adhocracy.create_initial_instance_page'):
        Page.create(instance, label, u"", user)

    # Autojoin the user in instances
    config_autojoin = config.get('adhocracy.instances.autojoin')
    if (config_autojoin and
            (config_autojoin == 'ALL' or
             key in (k.strip() for k in config_autojoin.split(',')))):
        users = adhocracy.model.User.all()
        for u in users:
            autojoin_membership = Membership(u, instance,
                                             instance.default_group)
            meta.Session.add(autojoin_membership)

    meta.Session.flush()
    return instance
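# Usage sketch (an assumption, not part of the original source): how a
# controller might call this classmethod, given a configured meta.Session
# and an existing User object bound to the hypothetical name `admin`:
#
#     instance = Instance.create(u'example-key', u'Example Instance', admin,
#                                description=u'A demo instance', locale='en')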
def crawler(name, query):
    '''Main Crawler for Job'''
    start = datetime.now()
    print name
    db = Database(name)
    db.create_colls()
    # seed the queue from the sources collection
    for n in db.sources.find():
        if n["url"] not in db.queue.distinct("url"):
            db.queue.insert(n)
    while db.queue.count() > 0:
        print "Beginning crawl"
        # print "Number of seed urls in sources database:", db.sources.count()
        # print "Number of pending urls to inspect:", len(db.queue.distinct("url"))
        for url in db.queue.distinct("url"):
            # only crawl urls that are not already stored in results
            if db.results.find_one({"url": url}) is None:
                print url
                p = Page(url, query)
                if p.create():
                    a = Article()
                else:
                    print p.error_type
                # print "Links", p.outlinks
                # db.results.update(p.info, {'$push': {"date": datetime.today()}}, upsert=True)
                # db.results.insert(p.info)
                # if p.outlinks is not None:
                #     try:
                #         for n_url in p.outlinks:
                #             if (n_url is not None
                #                     or n_url not in db.queue.find({"url": n_url})
                #                     or n_url not in db.results.find({"url": n_url})
                #                     or n_url not in db.log.find({"url": n_url})):
                #                 # Checking correct url before is problematic
                #                 # next_p = Page(n_url, query)
                #                 # if next_p.clean_url(p.url) is not None:
                #                 print n_url
                #                 db.queue.insert({"url": n_url})
                #     except mongo_err:
                #         db.log.update({"url": url,
                #                        "error_type": "pymongo error inserting outlinks",
                #                        "query": self.query, "status": False},
                #                       {'$push': {"date": datetime.today()}}, upsert=True)
                # elif p.error_type != 0:
                #     # if the page is not relevant do not store it in db
                #     db.log.update(p.bad_status(),
                #                   {'$push': {"date": datetime.today()}}, upsert=True)
                # else:
                #     continue
            db.queue.remove({"url": url})
            if db.queue.count() == 0:
                print db.stats()
                break
        if db.queue.count() == 0:
            print db.stats()
            break
    end = datetime.now()
    elapsed = end - start
    print "crawl finished, %i results and %i sources are stored in Mongo Database: %s in %s" % (
        db.results.count(), db.sources.count(), name, elapsed)
    return True
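# Usage sketch (hypothetical; name and query are placeholder values):
# assumes a reachable MongoDB instance and that Database, Page and Article
# are this project's own classes, imported in the same module:
#
#     if __name__ == "__main__":
#         crawler("jobs", "data engineer")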
def create(cls, key, label, user, description=None, locale=None):
    from group import Group
    from membership import Membership
    from page import Page

    instance = Instance(unicode(key).lower(), label, user)
    instance.description = description
    instance.default_group = Group.by_code(Group.INSTANCE_DEFAULT)
    if locale is not None:
        instance.locale = locale
    meta.Session.add(instance)
    supervisor_group = Group.by_code(Group.CODE_SUPERVISOR)
    membership = Membership(user, instance, supervisor_group,
                            approved=True)
    meta.Session.add(membership)
    Page.create(instance, label, u"", user)
    meta.Session.flush()
    return instance
def add_page(self, binary):
    from page import Page
    page = Page.create(day=self)
    page.set_image(binary=binary)
    self.pages.append(page)
    self.pages.reorder()
    db.session.commit()
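# Usage sketch (hypothetical): assumes `day` is a persisted model object
# exposing add_page(), and that the binary payload is raw image bytes:
#
#     with open('scan.png', 'rb') as f:
#         day.add_page(binary=f.read())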
def crawl(self):
    self.discovery()
    start = datetime.now()
    while self.db.queue.count() > 0:
        for url in self.db.queue.distinct("url"):
            print url, self.query
            p = Page(url, self.query)
            page = p.create()
            print page
            self.db.queue.remove({"url": url})
            if self.db.queue.count() == 0:
                break
        if self.db.queue.count() == 0:
            print self.db.stats()
            break
    end = datetime.now()
    elapsed = end - start
    print "crawl finished, %i results and %i sources are stored in Mongo Database: %s in %s" % (
        self.db.results.count(), self.db.sources.count(), self.project, elapsed)
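# Usage sketch (hypothetical; the owning class and its constructor are
# assumptions): crawl() expects self.db, self.query and self.project to be
# initialised before it runs, e.g.:
#
#     c = Crawler(project="jobs", query="data engineer")
#     c.crawl()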