def get_ids(self, urls):
    """ Return a list of IDs for these URLs """
    ret = []
    for u in urls:
        url = URL(u)
        ret.append(make_url_id(url))
    return ret
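# Hedged usage sketch (not part of the original module): get_ids() never touches the
# database, so an ID can be computed for a URL we have never stored. The caller name
# below is hypothetical and only here for illustration.
#
#     server = UrlServer()                       # hypothetical owner of get_ids()
#     ids = server.get_ids([
#         "http://example.com/",
#         "http://example.com/page?q=1",
#     ])
#     # => one integer per input URL, each computed as make_url_id(URL(u))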
def import_dump(self):
    """ Read a dump from a URL or a local file, and merge its data into RocksDB """

    db = Storage(read_only=False)
    write_batch = db.write_batch(None)

    start_time = time.time()
    done = 0

    for url, values in self.iter_rows():

        # TODO: RocksDB merge operator?
        existing_value = db.get(url)
        existing_pb = urlserver_pb2.UrlMetadata()

        if existing_value is not None:
            existing_pb.ParseFromString(existing_value)
        else:
            # In order to send the protobuf message untouched via RPC, we pre-compute the ID
            existing_pb.id = make_url_id(URL(url))

        for k, v in values.iteritems():
            if k in ("ut1_blacklist", ):
                # Repeated protobuf fields can't be assigned with setattr(), only extended
                for elt in v:
                    existing_pb.ut1_blacklist.append(elt)  # pylint: disable=no-member
            else:
                setattr(existing_pb, k, v)

        # print "IMPORT", url, existing_pb

        write_batch.put(url, existing_pb.SerializeToString())

        done += 1
        if self.dump_batch_size and (done % self.dump_batch_size) == 0:

            eta = 0
            if self.dump_count_estimate:
                # Remaining rows divided by the cumulative rate (rows/s), converted to hours
                eta = float(self.dump_count_estimate - done) / (3600.0 * done / (time.time() - start_time))

            print("Done %s (%s/s, ~%0.2f%%, ETA %0.2fh)" % (
                done,
                int(done / (time.time() - start_time)),
                (float(done * 100) / self.dump_count_estimate) if self.dump_count_estimate else 0,
                eta
            ))

            write_batch = db.write_batch(write_batch)

    print("Total rows: %s" % done)

    db.write_batch(write_batch)
    db.close()
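# Hedged sketch (an assumption, not the project's actual Storage class): the calling
# pattern above -- write_batch(None) to start, then write_batch(previous) at every
# flush -- suggests a contract roughly like the following. The internal helpers are
# hypothetical names.
#
#     class Storage(object):
#         def write_batch(self, previous_batch):
#             """ Commit `previous_batch` if any, then return a fresh empty batch. """
#             if previous_batch is not None:
#                 self._commit(previous_batch)    # hypothetical internal commit
#             return self._new_batch()            # hypothetical batch factory
#
# Under that reading, the final db.write_batch(write_batch) before db.close() flushes
# whatever remains in the last, possibly partial, batch.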
def import_dump(self):
    """ Read a dump from a URL or a local file, and merge its data into RocksDB """

    db = Storage(read_only=False)
    write_batch = db.write_batch(None)

    batch_time = time.time()
    done = 0

    for i, row in self.iter_dump():

        for key, values in self.import_row(i, row):

            url = key.encode("utf-8")

            # TODO: RocksDB merge operator?
            existing_value = db.get(url)
            existing_pb = urlserver_pb2.UrlMetadata()

            if existing_value is not None:
                existing_pb.ParseFromString(existing_value)
            else:
                # In order to send the protobuf message untouched via RPC, we pre-compute the ID
                existing_pb.id = make_url_id(URL(url))

            for k, v in values.iteritems():
                if k in ("ut1_blacklist", ):
                    for elt in v:
                        existing_pb.ut1_blacklist.append(elt)  # pylint: disable=no-member
                else:
                    setattr(existing_pb, k, v)

            # print "IMPORT", key, existing_pb

            write_batch.put(url, existing_pb.SerializeToString())

            done += 1
            if self.dump_batch_size and (done % self.dump_batch_size) == 0:
                # Per-batch rate: batch_time is reset after each flush, so divide the batch size by it
                print("Done %s (%s/s)" % (done, int(self.dump_batch_size / (time.time() - batch_time))))
                write_batch = db.write_batch(write_batch)
                batch_time = time.time()

    print("Total rows: %s" % done)

    db.write_batch(write_batch)
    db.close()
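# The TODO above asks whether a RocksDB merge operator could replace the
# read-modify-write cycle (db.get + ParseFromString + write_batch.put). Below is a
# hedged sketch of that idea, assuming the python-rocksdb bindings are used directly;
# the project's own Storage wrapper is not shown here, so treat this as an
# illustration of the technique rather than a drop-in change.

import rocksdb


class UrlMetadataMergeOperator(rocksdb.interfaces.AssociativeMergeOperator):
    """ Merges two serialized UrlMetadata messages inside RocksDB itself. """

    def merge(self, key, existing_value, value):
        if existing_value is None:
            return (True, value)
        merged = urlserver_pb2.UrlMetadata()
        merged.ParseFromString(existing_value)
        # MergeFromString overwrites scalar fields and appends repeated ones (e.g. ut1_blacklist)
        merged.MergeFromString(value)
        return (True, merged.SerializeToString())

    def name(self):
        return b"UrlMetadataMergeOperator"


# Hypothetical usage: the importer would then call merge() instead of get() + put(),
# and would always pre-compute pb.id since it can no longer know whether a row exists:
#
#     opts = rocksdb.Options(create_if_missing=True)
#     opts.merge_operator = UrlMetadataMergeOperator()
#     raw_db = rocksdb.DB("urlserver_rocksdb", opts)
#     raw_db.merge(url, existing_pb.SerializeToString())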
def get_metadata(self, urls):
    """ Return a list of serialized UrlMetadata messages for these *normalized* URLs """

    ret = []
    for url in urls:
        # `db` is the RocksDB-backed Storage handle opened elsewhere in this module (not shown here)
        data = db.get(url)

        # If the URL is in none of our static databases, we still want to return an ID
        if data is None:
            obj = urlserver_pb2.UrlMetadata()
            obj.id = make_url_id(URL(url))
            data = obj.SerializeToString()

        ret.append(data)

    return ret
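# Hedged sketch of how a caller might consume the result (the RPC transport and the
# client-side names are assumptions, not shown in this file): each element of the
# returned list is a serialized UrlMetadata message, sent over the wire untouched and
# parsed back on the client.
#
#     for raw in server.get_metadata(normalized_urls):   # hypothetical call site
#         metadata = urlserver_pb2.UrlMetadata()
#         metadata.ParseFromString(raw)
#         print(metadata.id, list(metadata.ut1_blacklist))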