Example #1
    def get_ids(self, urls):
        """ Return a list of IDs for these URLs """
        ret = []
        for u in urls:
            url = URL(u)
            ret.append(make_url_id(url))
        return ret
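Both `URL` and `make_url_id` come from elsewhere in the codebase and are not shown in these excerpts. A minimal sketch of the contract `get_ids` relies on, assuming the ID is a stable 64-bit hash of the normalized URL string (the real implementation may differ):

    import hashlib
    import struct

    def make_url_id(url):
        # Hypothetical: derive a stable 64-bit integer from the normalized URL.
        # str(url) is assumed to yield the normalized string form of the URL object.
        digest = hashlib.md5(str(url).encode("utf-8")).digest()
        return struct.unpack("<Q", digest[:8])[0]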
Example #2
    def import_dump(self):
        """ Read a dump from an URL or a local file, and merge its data in RocksDB """

        db = Storage(read_only=False)

        write_batch = db.write_batch(None)
        start_time = time.time()

        done = 0

        for url, values in self.iter_rows():

            # TODO: RocksDB merge operator? (see the sketch after this example)
            existing_value = db.get(url)
            existing_pb = urlserver_pb2.UrlMetadata()

            if existing_value is not None:
                existing_pb.ParseFromString(existing_value)
            else:
                # In order to send the protobuf message untouched via RPC, we pre-compute the ID
                existing_pb.id = make_url_id(URL(url))

            for k, v in values.items():
                if k in ("ut1_blacklist", ):
                    for elt in v:
                        existing_pb.ut1_blacklist.append(elt)  # pylint: disable=no-member
                else:
                    setattr(existing_pb, k, v)

            # print "IMPORT", key, existing_pb

            write_batch.put(url, existing_pb.SerializeToString())

            done += 1

            if self.dump_batch_size and (done % self.dump_batch_size) == 0:

                eta = 0.0
                if self.dump_count_estimate:
                    # Rows imported per hour so far, used to extrapolate the remaining time
                    rows_per_hour = 3600.0 * done / (time.time() - start_time)
                    eta = (self.dump_count_estimate - done) / rows_per_hour

                print("Done %s (%s/s, ~%0.2f%%, ETA %0.2fh)" % (
                    done,
                    int(done / (time.time() - start_time)),
                    (float(done * 100) / self.dump_count_estimate) if self.dump_count_estimate else 0,
                    eta
                ))
                write_batch = db.write_batch(write_batch)

        print("Total rows: %s" % done)
        db.write_batch(write_batch)
        db.close()
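The TODO above can be made concrete with a RocksDB merge operator, which would let the importer push partial updates instead of doing a read-modify-write per row. A minimal sketch, assuming the `python-rocksdb` bindings and the generated `urlserver_pb2` module used above; this is not the project's actual storage layer:

    import rocksdb
    import urlserver_pb2  # generated protobuf module, as used in the examples above

    class UrlMetadataMergeOperator(rocksdb.interfaces.AssociativeMergeOperator):

        def merge(self, key, existing_value, value):
            # No existing record: the operand is already a full serialized message
            if existing_value is None:
                return (True, value)
            existing_pb = urlserver_pb2.UrlMetadata()
            existing_pb.ParseFromString(existing_value)
            update_pb = urlserver_pb2.UrlMetadata()
            update_pb.ParseFromString(value)
            # MergeFrom appends repeated fields (like ut1_blacklist) and overwrites
            # singular fields that are set, mirroring the loop in import_dump
            existing_pb.MergeFrom(update_pb)
            return (True, existing_pb.SerializeToString())

        def name(self):
            return b"UrlMetadataMergeOperator"

    # The operator is registered when opening the database, after which
    # db.merge(url, partial_pb.SerializeToString()) replaces the get/put pair:
    # opts = rocksdb.Options(create_if_missing=True,
    #                        merge_operator=UrlMetadataMergeOperator())
    # db = rocksdb.DB("urlserver.db", opts)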
Example #3
    def import_dump(self):
        """ Read a dump from an URL or a local file, and merge its data in RocksDB """

        db = Storage(read_only=False)

        write_batch = db.write_batch(None)
        batch_time = time.time()

        done = 0
        for i, row in self.iter_dump():

            for key, values in self.import_row(i, row):

                url = key.encode("utf-8")

                # TODO: RocksDB merge operator? (see the sketch after Example #2)
                existing_value = db.get(url)
                existing_pb = urlserver_pb2.UrlMetadata()
                if existing_value is not None:
                    existing_pb.ParseFromString(existing_value)
                else:
                    # In order to send the protobuf message untouched via RPC, we pre-compute the ID
                    existing_pb.id = make_url_id(URL(url))

                for k, v in values.items():
                    if k in ("ut1_blacklist", ):
                        for elt in v:
                            existing_pb.ut1_blacklist.append(elt)  # pylint: disable=no-member
                    else:
                        setattr(existing_pb, k, v)

                # print "IMPORT", key, existing_pb

                write_batch.put(url, existing_pb.SerializeToString())

                done += 1

                if self.dump_batch_size and (done % self.dump_batch_size) == 0:
                    print "Done %s (%s/s)" % (done,
                                              int(done /
                                                  (time.time() - batch_time)))
                    write_batch = db.write_batch(write_batch)
                    batch_time = time.time()

        print "Total rows: %s" % done
        db.write_batch(write_batch)
        db.close()
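Both import variants depend on a `Storage.write_batch(previous)` helper that is not shown in these excerpts: it is first called with `None`, and then with the previous batch whenever a flush is due. From that calling pattern it presumably writes out the batch it receives and returns a fresh one. A minimal sketch of that contract, assuming `python-rocksdb` underneath; the real Storage class may differ:

    import rocksdb

    class Storage(object):
        """ Hypothetical sketch of the Storage wrapper used by import_dump """

        def __init__(self, read_only=False):
            # read_only is accepted for parity with the calls above; this sketch
            # always opens the database read-write
            self.db = rocksdb.DB("urlserver.db",
                                 rocksdb.Options(create_if_missing=True))

        def get(self, key):
            return self.db.get(key)

        def write_batch(self, previous_batch):
            # Flush the previous batch, if any, then hand back a fresh one
            if previous_batch is not None:
                self.db.write(previous_batch)
            return rocksdb.WriteBatch()

        def close(self):
            # python-rocksdb releases the handle when the object is garbage-collected;
            # newer versions also expose an explicit close()
            del self.db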
Example #4
    def get_metadata(self, urls):
        """ Return a list of tuples of metadata for these *normalized* URLs """

        ret = []
        for url in urls:

            # `db` is assumed to be a Storage handle opened elsewhere in the module
            data = db.get(url)

            # If the URL is in none of our static databases, we still want to return an ID
            if data is None:
                obj = urlserver_pb2.UrlMetadata()
                obj.id = make_url_id(URL(url))
                data = obj.SerializeToString()

            ret.append(data)

        return ret
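A short usage sketch, deserializing the returned bytes back into protobuf messages; `server` and the example URL are illustrative, not part of the excerpt:

    # Round-trip the serialized metadata returned by get_metadata
    for data in server.get_metadata([b"http://example.com/"]):
        pb = urlserver_pb2.UrlMetadata()
        pb.ParseFromString(data)
        print(pb.id)  # an ID is always present, even for URLs absent from the databases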