예제 #1
0
 def __init__(self):
     """Connect to the sentence collection and set up the sampling window.

     Opens the ``splited_sentences`` collection of the ``tokenizer``
     database on ``localhost:20000`` and initialises the state used by
     ``read_sentence_randomly``:

     * ``sentence_size`` -- number of documents in the collection at
       construction time.
     * ``step`` -- upper bound of the random stride, initially the whole
       collection size.
     * ``skip`` -- current offset into the collection, initially 0.
     """
     # NOTE(review): time_counter is defined elsewhere in the project;
     # presumably this first call starts/resets a timer silently -- confirm.
     time_counter(print_to_console=False)
     print("初始化 RemoteIO")
     client = MongoClient('localhost', 20000)
     self.db = client.get_database("tokenizer").get_collection('splited_sentences')
     # Cursor.count() is deprecated since PyMongo 3.7 and removed in 4.0;
     # count_documents({}) is the supported way to count every document.
     self.sentence_size = self.db.count_documents({})
     self.step = self.sentence_size
     self.skip = 0
     time_counter("初始化完毕")
예제 #2
0
 def read_sentence_randomly(self):
     """Fetch one sentence document at a random forward offset.

     The reader advances ``self.skip`` by a random stride in
     ``[0, self.step]`` and fetches the single document at that offset.
     Whenever the window ``skip + step`` would reach the end of the
     collection, the offset is reset to 0 and the stride bound is halved.
     The fetched document's ``analysed`` counter is incremented in the
     database.

     Returns:
         The fetched document (as returned by the aggregation), or ``None``
         when the stride bound has shrunk to 0 while the window still does
         not fit, or when no document exists at the chosen offset.
     """
     # Rewind and halve the stride until the window fits in the collection.
     while self.skip + self.step >= self.sentence_size:
         print("skip:%d, step:%d, size:%d" %
               (self.skip, self.step, self.sentence_size))
         if self.step == 0:
             return None
         self.skip = 0
         self.step //= 2

     # The loop above only exits once skip + step < sentence_size, so the
     # new offset is always below the size recorded at construction time.
     self.skip += random.randint(0, self.step)
     pipeline = [{"$skip": self.skip}, {"$limit": 1}]
     docs = list(self.db.aggregate(pipeline))
     if not docs:
         # The aggregation returned nothing at this offset (e.g. the
         # collection holds fewer documents than sentence_size); report
         # "no document" instead of failing on None["_id"].
         return None

     doc = docs[0]
     # Collection.update() was removed in PyMongo 4.0; update_one is the
     # equivalent single-document update.
     self.db.update_one({"_id": doc["_id"]}, {"$inc": {"analysed": 1}})
     time_counter("已获取到")
     return doc