Пример #1
0
 def new_WE(self,
            weid,
            prefixes,
            name=None,
            status="DISCOVERED",
            startpages=None,
            tags=None):
     """Build and return a fresh WebEntity document as a plain dict.

     Args:
         weid: unique id used as the Mongo ``_id``.
         prefixes: list of LRU prefixes defining the entity; also used to
             derive a display name when ``name`` is not given.
         name: optional display name; derived from prefixes when falsy.
         status: initial status field (defaults to "DISCOVERED").
         startpages: optional list of start pages (defaults to a new list).
         tags: optional tags mapping (defaults to a new dict).

     Returns:
         dict ready to be inserted, with creation/modification timestamps.

     Fixes: mutable default arguments replaced by None sentinels so the
     same list/dict is no longer shared across calls, and the name
     fallback loop now tries each prefix in turn (it previously called
     name_lru(prefixes[0]) on every iteration, making the loop pointless).
     """
     timestamp = now_ts()
     if startpages is None:
         startpages = []
     if tags is None:
         tags = {}
     if not name:
         for p in prefixes:
             try:
                 # Use the first prefix that yields a readable name.
                 name = name_lru(p)
                 break
             except ValueError:
                 pass
         else:
             # No prefix parsed successfully: keep the raw first prefix.
             name = prefixes[0]
     return {
         "_id": weid,
         "prefixes": prefixes,
         "name": name,
         "status": status,
         "tags": tags,
         "homepage": None,
         "startpages": startpages,
         "crawled": False,
         "creationDate": timestamp,
         "lastModificationDate": timestamp
     }
Пример #2
0
 def add_corpus(self, corpus, name, password, options, tlds=None):
     """Create the metadata document for a new corpus and build its indexes.

     All counters start at zero; activity/loop timestamps start at "now".
     The password is stored salted, never in clear.
     """
     ts = now_ts()
     corpus_doc = {
         "_id": corpus,
         "name": name,
         "password": salt(password),
         "options": options,
         "total_webentities": 0,
         "webentities_in": 0,
         "webentities_in_untagged": 0,
         "webentities_in_uncrawled": 0,
         "webentities_out": 0,
         "webentities_undecided": 0,
         "webentities_discovered": 0,
         "total_crawls": 0,
         "total_pages": 0,
         "total_pages_crawled": 0,
         "created_at": ts,
         "last_activity": ts,
         "recent_changes": False,
         "last_index_loop": ts,
         "links_duration": 1,
         "last_links_loop": 0,
         "tlds": tlds
     }
     yield self.db["corpus"].insert(corpus_doc, safe=True)
     yield self.init_corpus_indexes(corpus)
Пример #3
0
 def add_corpus(self, corpus, name, password, options, tlds=None):
     """Register a new corpus document (pymongo 3 API) and init its indexes.

     Counters start at zero; the tags and webentities_links caches start
     as msgpack-encoded empty mappings stored as BSON Binary blobs.
     """
     ts = now_ts()
     # msgpack-serialized empty mapping for the binary cache fields
     empty_blob = Binary(msgpack.packb({}))
     corpus_doc = {
         "_id": corpus,
         "name": name,
         "password": salt(password),
         "options": options,
         "total_webentities": 0,
         "webentities_in": 0,
         "webentities_in_untagged": 0,
         "webentities_in_uncrawled": 0,
         "webentities_out": 0,
         "webentities_undecided": 0,
         "webentities_discovered": 0,
         "total_crawls": 0,
         "crawls_pending": 0,
         "crawls_running": 0,
         "total_pages": 0,
         "total_pages_crawled": 0,
         "total_pages_queued": 0,
         "total_links_found": 0,
         "recent_changes": False,
         "last_index_loop": ts,
         "links_duration": 1,
         "last_links_loop": 0,
         "tags": empty_blob,
         "webentities_links": empty_blob,
         "created_at": ts,
         "last_activity": ts,
         "tlds": tlds
     }
     yield self.db()["corpus"].insert_one(corpus_doc)
     yield self.init_corpus_indexes(corpus)
Пример #4
0
    def depile(self):
        """Take the most eligible queued job and submit it to ScrapyD.

        No-ops (returnD(None)) when the queue is uninitialized/empty or
        when ScrapyD already has pending jobs. On success the job is
        persisted as scheduled, logged, and removed from self.queue; on
        error its timestamp is bumped so it is retried later.
        """
        if self.queue is None:
            yield self.init_queue()
        if not len(self.queue):
            returnD(None)

        # Only push new work when ScrapyD reports no pending jobs.
        status = yield self.get_scrapyd_status()
        if status["pending"] > 0:
            returnD(None)
        # Add some random wait to allow possible concurrent Hyphe instance
        # to compete for ScrapyD's empty slots
        yield deferredSleep(1./randint(4,20))

        # Order jobs by corpus with less currently running crawls then age
        # NOTE(review): the key packs "runningcount.timestamp" into a float
        # so a single number sorts on both criteria at once.
        ordered = sorted(self.queue.items(), key=lambda x: \
          float("%s.%s" % (status.get(x[1]["corpus"], 0), x[1]["timestamp"])))
        job_id, job = ordered[0]
        res = yield self.send_scrapy_query('schedule', job["crawl_arguments"])
        ts = now_ts()
        if is_error(res):
            logger.msg("WARNING: error sending job %s to ScrapyD: %s" % (job, res))
            self.queue[job_id]['timestamp'] = ts    # let it retry a bit later
        else:
            # Success: record ScrapyD's job id and drop it from our queue.
            yield self.db.update_job(job["corpus"], job_id, res['jobid'], ts)
            yield self.db.add_log(job["corpus"], job_id, "CRAWL_SCHEDULED", ts)
            del(self.queue[job_id])
Пример #5
0
 def update_job(self, corpus, job_id, crawl_id, timestamp=None):
     """Attach the ScrapyD crawl id to a job and stamp its schedule time.

     timestamp defaults to "now" when not supplied (or falsy).
     """
     when = timestamp or now_ts()
     fields = {"crawljob_id": crawl_id, "scheduled_at": when}
     yield self.jobs(corpus).update_one({"_id": job_id}, {"$set": fields})
Пример #6
0
 def save_stats(self, corpus, corpus_metas):
     """Snapshot current webentity counters into the corpus stats collection."""
     snapshot = {"timestamp": now_ts()}
     # Map the short stat keys to their counterparts in corpus_metas.
     for key, meta_key in (("total", "total_webentities"),
                           ("in", "webentities_in"),
                           ("out", "webentities_out"),
                           ("discovered", "webentities_discovered"),
                           ("undecided", "webentities_undecided")):
         snapshot[key] = corpus_metas[meta_key]
     yield self.stats(corpus).insert(snapshot, safe=True)
Пример #7
0
 def add_log(self, corpus, job, msg, timestamp=None):
     """Append one log entry per job id to the corpus logs collection.

     Args:
         corpus: corpus whose logs collection receives the entries.
         job: a single job id or a list of job ids.
         msg: log message stored verbatim on every entry.
         timestamp: shared timestamp for all entries; defaults to now.
     """
     if not timestamp:
         timestamp = now_ts()
     # isinstance is the idiomatic type check (also accepts list subclasses,
     # unlike the previous `type(job) != list` comparison).
     if not isinstance(job, list):
         job = [job]
     yield self.logs(corpus).insert_many([{
         '_job': _id,
         'timestamp': timestamp,
         'log': msg
     } for _id in job])
Пример #8
0
 def add_job(self, args, corpus, webentity_id):
     """Persist a new crawl job, enqueue it locally and log its creation.

     Returns (via returnD) the id of the newly created job.
     """
     created = now_ts()
     job_id = yield self.db.add_job(corpus, webentity_id, args, created)
     # Mirror the job in the in-memory queue used by the scheduler.
     entry = {
       "corpus": corpus,
       "timestamp": created,
       "crawl_arguments": args
     }
     self.queue[job_id] = entry
     yield self.db.add_log(corpus, job_id, "CRAWL_ADDED", created)
     returnD(job_id)
Пример #9
0
 def save_stats(self, corpus, corpus_metas):
     """Store a timestamped snapshot of the corpus webentity counters."""
     doc = {"timestamp": now_ts(),
            "total": corpus_metas["total_webentities"],
            "in": corpus_metas['webentities_in'],
            "out": corpus_metas['webentities_out'],
            "discovered": corpus_metas['webentities_discovered'],
            "undecided": corpus_metas['webentities_undecided']}
     yield self.stats(corpus).insert(doc, safe=True)
Пример #10
0
 def save_stats(self, corpus, corpus_metas):
     """Snapshot webentity counters, skipping the write when nothing changed."""
     # Map short stat field names to their counters in corpus_metas.
     new_doc = {short: corpus_metas[full] for short, full in (
         ("total", "total_webentities"),
         ("in", "webentities_in"),
         ("in_untagged", "webentities_in_untagged"),
         ("in_uncrawled", "webentities_in_uncrawled"),
         ("out", "webentities_out"),
         ("discovered", "webentities_discovered"),
         ("undecided", "webentities_undecided"))}
     previous = yield self.get_last_stats(corpus)
     if previous:
         # Strip volatile fields so the comparison covers counters only.
         previous.pop("timestamp")
         previous.pop("_id")
     # new_doc is never empty, so a falsy `previous` always compares unequal.
     if previous != new_doc:
         new_doc["timestamp"] = now_ts()
         yield self.stats(corpus).insert_one(new_doc)
Пример #11
0
 def save_stats(self, corpus, corpus_metas):
     """Persist the webentity counters only when they differ from the last snapshot."""
     fresh = {
       "total": corpus_metas["total_webentities"],
       "in": corpus_metas['webentities_in'],
       "in_untagged": corpus_metas['webentities_in_untagged'],
       "in_uncrawled": corpus_metas['webentities_in_uncrawled'],
       "out": corpus_metas['webentities_out'],
       "discovered": corpus_metas['webentities_discovered'],
       "undecided": corpus_metas['webentities_undecided']
     }
     last = yield self.get_last_stats(corpus)
     if last:
         # Remove bookkeeping fields before comparing counter values.
         del last["timestamp"]
         del last["_id"]
     if not last or last != fresh:
         fresh["timestamp"] = now_ts()
         yield self.stats(corpus).insert_one(fresh)
Пример #12
0
 def add_job(self, corpus, job_id, webentity_id, args, timestamp=None):
     """Insert a new crawl-job document with PENDING crawl/index statuses.

     timestamp defaults to "now" when not supplied (or falsy).
     """
     created = timestamp or now_ts()
     job_doc = {
       "_id": job_id,
       "webentity_id": webentity_id,
       "nb_crawled_pages": 0,
       "nb_pages": 0,
       "nb_links": 0,
       "crawl_arguments": args,
       "crawling_status": crawling_statuses.PENDING,
       "indexing_status": indexing_statuses.PENDING,
       "created_at": created,
       "started_at": None,
       "crawled_at": None,
       "finished_at": None
     }
     yield self.jobs(corpus).insert(job_doc, safe=True)
Пример #13
0
 def add_corpus(self, corpus, name, password, options):
     """Create the corpus metadata document and initialize its indexes.

     Counters start at zero and all activity timestamps start at "now";
     the password is stored salted.
     """
     ts = now_ts()
     doc = {
       "_id": corpus,
       "name": name,
       "password": salt(password),
       "options": options
     }
     # Zero-initialize every counter field.
     for counter in ("total_webentities", "webentities_in", "webentities_out",
                     "webentities_undecided", "webentities_discovered",
                     "total_crawls", "total_pages", "total_pages_crawled"):
         doc[counter] = 0
     # All date fields start at creation time.
     for datefield in ("created_at", "last_activity",
                       "last_index_loop", "last_links_loop"):
         doc[datefield] = ts
     yield self.db["corpus"].insert(doc, safe=True)
     yield self.init_corpus_indexes(corpus)
Пример #14
0
 def add_job(self, corpus, webentity_id, args, timestamp=None):
     """Create a PENDING crawl job with a fresh uuid and return its id.

     timestamp defaults to "now" when not supplied (or falsy); the new
     job id is handed back via returnD.
     """
     created = timestamp or now_ts()
     job_id = str(uuid())
     job_doc = {
       "_id": job_id,
       "crawljob_id": None,
       "webentity_id": webentity_id,
       "nb_crawled_pages": 0,
       "nb_unindexed_pages": 0,
       "nb_pages": 0,
       "nb_links": 0,
       "crawl_arguments": args,
       "crawling_status": crawling_statuses.PENDING,
       "indexing_status": indexing_statuses.PENDING,
       "created_at": created,
       "scheduled_at": None,
       "started_at": None,
       "crawled_at": None,
       "finished_at": None
     }
     yield self.jobs(corpus).insert_one(job_doc)
     returnD(job_id)
Пример #15
0
 def new_WE(self, weid, prefixes, name=None, status="DISCOVERED", startpages=None, tags=None):
     """Build and return a new WebEntity document dict.

     Args:
         weid: unique id stored as ``_id``.
         prefixes: list of LRU prefixes; also the source of a fallback name.
         name: optional display name, derived from prefixes when falsy.
         status: initial status (defaults to "DISCOVERED").
         startpages: optional start pages list (fresh list per call).
         tags: optional tags dict (fresh dict per call).

     Fixes: mutable default arguments ([]/{}) replaced with None sentinels
     so state is no longer shared across calls, and the fallback loop now
     tries each prefix (it previously retried prefixes[0] every iteration).
     """
     timestamp = now_ts()
     if startpages is None:
         startpages = []
     if tags is None:
         tags = {}
     if not name:
         for p in prefixes:
             try:
                 # First prefix that parses provides the display name.
                 name = name_lru(p)
                 break
             except ValueError:
                 pass
         else:
             # None parsed: fall back to the raw first prefix.
             name = prefixes[0]
     return {
       "_id": weid,
       "prefixes": prefixes,
       "name": name,
       "status": status,
       "tags": tags,
       "homepage": None,
       "startpages": startpages,
       "crawled": False,
       "creationDate": timestamp,
       "lastModificationDate": timestamp
     }
Пример #16
0
 def upsert_WE(self, corpus, weid, metas, update_timestamp=True):
     """Upsert metadata fields on a webentity, optionally bumping its mtime.

     Note: mutates the caller's `metas` dict when update_timestamp is True.
     """
     if update_timestamp:
         metas["lastModificationDate"] = now_ts()
     selector = {"_id": weid}
     yield self.WEs(corpus).update_one(selector, {"$set": metas}, upsert=True)
Пример #17
0
 def add_log(self, corpus, job, msg, timestamp=None):
     """Write one log entry per job id with a shared timestamp.

     Args:
         corpus: corpus whose logs collection receives the entries.
         job: a single job id or a list of job ids.
         msg: log text stored verbatim on every entry.
         timestamp: shared timestamp; defaults to now when falsy.
     """
     if not timestamp:
         timestamp = now_ts()
     # isinstance is the idiomatic type check (also covers list subclasses,
     # unlike the previous `type(job) != list` comparison).
     if not isinstance(job, list):
         job = [job]
     entries = [{'_job': _id, 'timestamp': timestamp, 'log': msg} for _id in job]
     yield self.logs(corpus).insert(entries, multi=True, safe=True)
Пример #18
0
 def update_job(self, corpus, job_id, crawl_id, timestamp=None):
     """Store the ScrapyD crawl id and schedule time on a job document.

     timestamp defaults to "now" when not supplied (or falsy).
     """
     when = timestamp or now_ts()
     update = {"$set": {"crawljob_id": crawl_id, "scheduled_at": when}}
     yield self.jobs(corpus).update({"_id": job_id}, update, safe=True)
Пример #19
0
 def upsert_WE(self, corpus, weid, metas, updateTimestamp=True):
     """Apply `metas` as a $set upsert on the webentity.

     When updateTimestamp is True, lastModificationDate is refreshed
     (note: this mutates the caller's dict).
     """
     if updateTimestamp:
         metas["lastModificationDate"] = now_ts()
     yield self.WEs(corpus).update_one(
         {"_id": weid}, {"$set": metas}, upsert=True)