Example #1
    def apply(self,
              xs,
              clear=True,
              parallelism=None,
              progress_bar=True,
              count=None,
              **kwargs):
        """
        Apply the given UDF to the set of objects xs, either single or
        multi-threaded, and optionally calling clear() first.
        """
        # Clear everything downstream of this UDF if requested
        if clear:
            self.logger.info("Clearing existing...")
            Session = new_sessionmaker()
            session = Session()
            self.clear(session, **kwargs)
            session.commit()
            session.close()

        # Execute the UDF
        self.logger.info("Running UDF...")
        if parallelism is None or parallelism < 2:
            self.apply_st(xs, progress_bar, clear=clear, count=count, **kwargs)
        else:
            self.apply_mt(xs, parallelism, clear=clear, **kwargs)
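
For context, a call to this entry point might look like the sketch below; labeler and docs are hypothetical names for a UDFRunner instance and its input collection, not taken from the snippet.

# Hypothetical usage of apply(): run the UDF over docs with four worker
# processes, clearing any previously generated output first.
labeler.apply(docs, parallelism=4, clear=True, progress_bar=True)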
Example #2
 def run(self) -> None:
     """
     This method is called when the UDF is run as a Process in a
     multiprocess setting The basic routine is: get from JoinableQueue,
     apply, put / add outputs, loop
     """
     # Each UDF starts its own Engine
     # See SQLalchemy, using connection pools with multiprocessing.
     Session = new_sessionmaker()
     session = Session()
     while True:
         try:
             doc = self.in_queue.get_nowait()
             # Merge the object with the session owned by the current child process.
             # If it is transient (i.e., not yet saved), save the object to the database.
             # Otherwise, load it from the database without the overhead of reconciliation.
             if inspect(doc).transient:  # This only happens during parser.apply
                 doc = session.merge(doc, load=True)
             else:
                 doc = session.merge(doc, load=False)
             y = self.apply(doc, **self.apply_kwargs)
             self.out_queue.put(y)
         except Empty:
             break
     session.commit()
     session.close()
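
The snippets above rely on new_sessionmaker() without showing it. A minimal sketch of what such a factory typically does, assuming standard SQLAlchemy practice rather than the library's actual implementation, is to build a fresh Engine inside the calling process so no connection pool is shared across forked workers:

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.pool import NullPool

def new_sessionmaker(db_url="postgresql://localhost:5432/mydb"):  # hypothetical signature and URL
    # NullPool avoids handing pooled connections created in the parent
    # process to forked children, which SQLAlchemy cautions against.
    engine = create_engine(db_url, poolclass=NullPool)
    return sessionmaker(bind=engine)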
Example #3
 def load_matrix(self, split, ignore_keys=[]):
     Session = new_sessionmaker()
     session = Session()
     candidates = session.query(Candidate).filter(Candidate.split == split).all()
     with _meta.engine.connect() as con:
         return load_annotation_matrix(
             con,
             candidates,
             split,
             self.table_name,
             self.key_table_name,
             False,  # replace_key_set
             None,   # storage
             False,  # update_keys
             ignore_keys,
         )
Example #4
    def __init__(self, in_queue=None, out_queue=None, worker_id=0):
        """
        in_queue: A Queue of input objects to process; primarily for running in parallel
        """
        Process.__init__(self)
        self.daemon = True
        self.in_queue = in_queue
        self.out_queue = out_queue
        self.worker_id = worker_id

        # Each UDF starts its own Engine.
        # See the SQLAlchemy docs on using connection pools with multiprocessing.
        Session = new_sessionmaker()
        self.session = Session()

        # We use a workaround to pass in the apply kwargs
        self.apply_kwargs = {}
Example #5
 def run(self) -> None:
     """
     This method is called when the UDF is run as a Process in a
     multiprocess setting The basic routine is: get from JoinableQueue,
     apply, put / add outputs, loop
     """
     # Each UDF starts its own Engine
     # See SQLalchemy, using connection pools with multiprocessing.
     Session = new_sessionmaker()
     self.session = Session()
     while True:
         try:
             doc = self.in_queue.get_nowait()
             self.session.add_all(
                 y for y in self.apply(doc, **self.apply_kwargs))
             self.out_queue.put(UDF.TASK_DONE)
         except Empty:
             break
     self.session.commit()
     self.session.close()
Example #6
    def apply(self,
              xs,
              clear=True,
              parallelism=None,
              progress_bar=True,
              count=None,
              **kwargs):
        """
        Apply the given UDF to the set of objects xs, either single or
        multi-threaded, and optionally calling clear() first.
        """
        # Clear everything downstream of this UDF if requested
        if clear:
            self.logger.info("Clearing existing...")
            Session = new_sessionmaker()
            session = Session()
            self.clear(session, **kwargs)
            session.commit()
            session.close()

        # Execute the UDF
        self.logger.info("Running UDF...")

        # Set up the progress bar: only when requested, and only if we can
        # determine a total from count or from len(xs).
        if progress_bar and (hasattr(xs, "__len__") or count is not None):
            self.logger.debug("Setting up progress bar...")
            n = count if count is not None else len(xs)
            self.pb = tqdm(total=n)

        if parallelism is None or parallelism < 2:
            self.apply_st(xs, clear=clear, count=count, **kwargs)
        else:
            self.apply_mt(xs, parallelism, clear=clear, **kwargs)

        # Close progress bar
        if self.pb is not None:
            self.logger.debug("Closing progress bar...")
            self.pb.close()
Example #7
    def run(self) -> None:
        """Run function of UDF.

        Call this method when the UDF is run as a Process in a
        multiprocess setting The basic routine is: get from JoinableQueue,
        apply, put / add outputs, loop
        """
        # Each UDF starts its own Engine
        # See SQLalchemy, using connection pools with multiprocessing.
        Session = new_sessionmaker()
        session = Session()
        while True:
            doc = self.in_queue.get()  # block until an item is available
            if doc == UDF.TASK_DONE:
                break
            # Merge the object with the session owned by the current child process.
            # This does not happen during parsing when doc is transient.
            if not inspect(doc).transient:
                doc = session.merge(doc, load=False)
            y = self.apply(doc, **self.apply_kwargs)
            self.out_queue.put((doc.name, y))
        session.commit()
        session.close()
Example #8
    def __init__(
        self,
        in_queue: Optional[Queue] = None,
        out_queue: Optional[JoinableQueue] = None,
        worker_id: int = 0,
        **udf_init_kwargs: Any,
    ) -> None:
        """
        in_queue: A Queue of input objects to process; primarily for running in parallel
        """
        super().__init__()
        self.daemon = True
        self.in_queue = in_queue
        self.out_queue = out_queue
        self.worker_id = worker_id

        # Each UDF starts its own Engine.
        # See the SQLAlchemy docs on using connection pools with multiprocessing.
        Session = new_sessionmaker()
        self.session = Session()

        # We use a workaround to pass in the apply kwargs
        self.apply_kwargs: Dict[str, Any] = {}
Example #9
    def _apply(
        self, doc_loader: Collection[Document], parallelism: int, **kwargs: Any
    ) -> None:
        """Run the UDF multi-threaded using python multiprocessing."""
        if not Meta.postgres:
            raise ValueError("Fonduer must use PostgreSQL as a database backend.")

        # Create an input queue to feed documents to UDF workers
        manager = Manager()
        # Set maxsize (#435). The number is heuristically determined.
        in_queue = manager.Queue(maxsize=parallelism * 2)
        # Use an output queue to track multiprocess progress
        out_queue = manager.Queue()

        # Reset the record of documents parsed by the previous run
        self.last_docs = set()

        # Create a DB session factory for inserting data in each UDF (#545)
        session_factory = new_sessionmaker()
        # Create UDF Processes
        for i in range(parallelism):
            udf = self.udf_class(
                session_factory=session_factory,
                runner=self,
                in_queue=in_queue,
                out_queue=out_queue,
                worker_id=i,
                **self.udf_init_kwargs,
            )
            udf.apply_kwargs = kwargs
            self.udfs.append(udf)

        # Start the UDF processes
        for udf in self.udfs:
            udf.start()

        # Fill the input queue with documents; the number of docs in the queue is capped (#435).
        def in_thread_func() -> None:
            # Do not use session here to prevent concurrent use (#482).
            for doc in doc_loader:
                in_queue.put(doc)  # block until a free slot is available

        Thread(target=in_thread_func).start()

        count_parsed = 0
        total_count = len(doc_loader)

        while (
            any([udf.is_alive() for udf in self.udfs]) or not out_queue.empty()
        ) and count_parsed < total_count:
            # Get doc from the out_queue and persist the result into postgres
            try:
                doc_name = out_queue.get()  # block until an item is available
                self.last_docs.add(doc_name)
                # Update progress bar whenever an item has been processed
                count_parsed += 1
                if self.pb is not None:
                    self.pb.update(1)
            except Exception as e:
                # Re-raise any other exception.
                raise e

        # Join the UDF processes
        for _ in self.udfs:
            in_queue.put(UDF.TASK_DONE)
        for udf in self.udfs:
            udf.join()

        # Flush the processes
        self.udfs = []
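
The orchestration above combines a bounded input queue, a feeder thread, worker processes, and one UDF.TASK_DONE sentinel per worker. A self-contained sketch of the same producer/consumer pattern, using only the standard library and made-up names (worker and TASK_DONE stand in for the UDF machinery):

from multiprocessing import Manager, Process

TASK_DONE = "TASK_DONE"

def worker(in_queue, out_queue):
    while True:
        item = in_queue.get()        # block until an item is available
        if item == TASK_DONE:
            break
        out_queue.put(item * item)   # stand-in for self.apply(doc, ...)

if __name__ == "__main__":
    manager = Manager()
    in_queue = manager.Queue(maxsize=8)   # bounded, as in the snippet above
    out_queue = manager.Queue()
    workers = [Process(target=worker, args=(in_queue, out_queue)) for _ in range(4)]
    for w in workers:
        w.start()
    for item in range(20):
        in_queue.put(item)           # blocks when the queue is full (backpressure)
    for _ in workers:
        in_queue.put(TASK_DONE)      # one sentinel per worker
    for w in workers:
        w.join()
    results = [out_queue.get() for _ in range(20)]
    print(sorted(results))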
Example #10
    def apply(self,
              split,
              key_group=0,
              replace_key_set=True,
              update_keys=False,
              update_values=True,
              storage=None,
              ignore_keys=[],
              **kwargs):
        if update_keys:
            replace_key_set = False
        # Get the cids based on the split, and also the count
        Session = new_sessionmaker()
        session = Session()
        # Note: In the current UDFRunner implementation, we load all these into memory and fill a
        # multiprocessing JoinableQueue with them before starting... so might as well load them here and pass in.
        # Also, if we try to pass in a query iterator instead, with AUTOCOMMIT on, we get a TXN error...
        candidates = session.query(Candidate).filter(
            Candidate.split == split).all()
        cids_count = len(candidates)
        if cids_count == 0:
            raise ValueError("No candidates in current split")

        # Setting up job batches
        chunks = cids_count // self.batch_size
        batch_range = [(i * self.batch_size, (i + 1) * self.batch_size)
                       for i in range(chunks)]
        remainder = cids_count % self.batch_size
        if remainder:
            batch_range.append((chunks * self.batch_size, cids_count))

        old_table_name = None
        table_name = self.table_name
        # Run the Annotator
        with _meta.engine.connect() as con:
            table_already_exists = table_exists(con, table_name)
            if update_values and table_already_exists:
                # Now we extract under a temporary name for merging
                old_table_name = table_name
                table_name += "_updates"

            segment_file_blob = os.path.join(
                segment_dir,
                _segment_filename(_meta.DBNAME, self.table_name, split))
            remove_files(segment_file_blob)
            cache = self.annotation_type == "feature"
            super(BatchAnnotator, self).apply(batch_range,
                                              table_name=self.table_name,
                                              split=split,
                                              cache=cache,
                                              **kwargs)

            # Insert and update keys
            if not table_already_exists or old_table_name:
                con.execute(
                    "CREATE TABLE %s(candidate_id integer PRIMARY KEY, keys text[] NOT NULL, values real[] NOT NULL)"
                    % table_name)
            copy_postgres(segment_file_blob, table_name,
                          "candidate_id, keys, values")
            remove_files(segment_file_blob)

            # Replace the LIL table with COO if requested
            if storage == "COO":
                temp_coo_table = table_name + "_COO"
                con.execute(
                    "CREATE TABLE %s AS "
                    "(SELECT candidate_id, UNNEST(keys) as key, UNNEST(values) as value from %s)"
                    % (temp_coo_table, table_name))
                con.execute("DROP TABLE %s" % table_name)
                con.execute("ALTER TABLE %s RENAME TO %s" %
                            (temp_coo_table, table_name))
                con.execute(
                    "ALTER TABLE %s ADD PRIMARY KEY(candidate_id, key)" %
                    table_name)
                # Update old table
                if old_table_name:
                    con.execute(
                        "INSERT INTO %s SELECT * FROM %s ON CONFLICT(candidate_id, key) "
                        "DO UPDATE SET value=EXCLUDED.value" %
                        (old_table_name, table_name))
                    con.execute("DROP TABLE %s" % table_name)
            else:  # LIL
                # Update old table
                if old_table_name:
                    con.execute(
                        "INSERT INTO %s AS old SELECT * FROM %s ON CONFLICT(candidate_id) "
                        "DO UPDATE SET "
                        "values=old.values || EXCLUDED.values,"
                        "keys=old.keys || EXCLUDED.keys" %
                        (old_table_name, table_name))
                    con.execute("DROP TABLE %s" % table_name)

            if old_table_name:
                table_name = old_table_name
            # Load the matrix
            key_table_name = self.key_table_name
            if key_group:
                key_table_name = self.key_table_name + "_" + get_sql_name(
                    key_group)

            return load_annotation_matrix(
                con,
                candidates,
                split,
                table_name,
                key_table_name,
                replace_key_set,
                storage,
                update_keys,
                ignore_keys,
            )
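
A hypothetical invocation of this BatchAnnotator.apply(); featurizer stands in for a concrete annotator instance, and the name and split value are assumptions:

# Extract annotations for split 0 and materialize the matrix in COO layout.
F_train = featurizer.apply(split=0, storage="COO", replace_key_set=True)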