def apply(self, xs, clear=True, parallelism=None, progress_bar=True, count=None, **kwargs):
    """
    Apply the given UDF to the set of objects xs, either single or multi-threaded,
    and optionally calling clear() first.
    """
    # Clear everything downstream of this UDF if requested
    if clear:
        self.logger.info("Clearing existing...")
        Session = new_sessionmaker()
        session = Session()
        self.clear(session, **kwargs)
        session.commit()
        session.close()

    # Execute the UDF
    self.logger.info("Running UDF...")
    if parallelism is None or parallelism < 2:
        self.apply_st(xs, progress_bar, clear=clear, count=count, **kwargs)
    else:
        self.apply_mt(xs, parallelism, clear=clear, **kwargs)
def run(self) -> None:
    """
    This method is called when the UDF is run as a Process in a multiprocess
    setting. The basic routine is: get from JoinableQueue, apply, put / add
    outputs, loop.
    """
    # Each UDF starts its own Engine
    # See SQLAlchemy docs on using connection pools with multiprocessing.
    Session = new_sessionmaker()
    session = Session()
    while True:
        try:
            doc = self.in_queue.get_nowait()
            # Merge the object with the session owned by the current child process.
            # If transient (i.e. not saved), save the object to the database.
            # If not, load it from the database w/o the overhead of reconciliation.
            if inspect(doc).transient:  # This only happens during parser.apply
                doc = session.merge(doc, load=True)
            else:
                doc = session.merge(doc, load=False)
            y = self.apply(doc, **self.apply_kwargs)
            self.out_queue.put(y)
        except Empty:
            break
    session.commit()
    session.close()
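# Background for the transient/merge branch above: SQLAlchemy's inspect() exposes an
# object's persistence state. A transient object has never been flushed to the database;
# merge(..., load=True) will insert it on flush, while an already-persisted object can be
# merged into the child process's session with load=False to skip the reconciliation query.
# The self-contained sketch below (SQLAlchemy 1.4+, in-memory SQLite, made-up Doc model)
# only illustrates the transient vs. persistent distinction; it is not Fonduer code.
from sqlalchemy import Column, Integer, String, create_engine, inspect
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()


class Doc(Base):
    __tablename__ = "docs"
    id = Column(Integer, primary_key=True)
    name = Column(String)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)

doc = Doc(name="d1")
assert inspect(doc).transient       # never saved: merging it would insert a new row

session = Session()
session.add(doc)
session.commit()
assert inspect(doc).persistent      # saved and tracked by this session
session.close()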
def load_matrix(self, split, ignore_keys=[]):
    """Load the annotation matrix for all candidates in the given split."""
    Session = new_sessionmaker()
    session = Session()
    candidates = session.query(Candidate).filter(Candidate.split == split).all()
    with _meta.engine.connect() as con:
        return load_annotation_matrix(
            con,
            candidates,
            split,
            self.table_name,
            self.key_table_name,
            False,
            None,
            False,
            ignore_keys,
        )
def __init__(self, in_queue=None, out_queue=None, worker_id=0):
    """
    in_queue: A Queue of input objects to process; primarily for running in parallel
    """
    Process.__init__(self)
    self.daemon = True
    self.in_queue = in_queue
    self.out_queue = out_queue
    self.worker_id = worker_id

    # Each UDF starts its own Engine
    # See SQLAlchemy docs on using connection pools with multiprocessing.
    Session = new_sessionmaker()
    self.session = Session()

    # We use a workaround to pass in the apply kwargs
    self.apply_kwargs = {}
def run(self) -> None:
    """
    This method is called when the UDF is run as a Process in a multiprocess
    setting. The basic routine is: get from JoinableQueue, apply, put / add
    outputs, loop.
    """
    # Each UDF starts its own Engine
    # See SQLAlchemy docs on using connection pools with multiprocessing.
    Session = new_sessionmaker()
    self.session = Session()
    while True:
        try:
            doc = self.in_queue.get_nowait()
            self.session.add_all(y for y in self.apply(doc, **self.apply_kwargs))
            self.out_queue.put(UDF.TASK_DONE)
        except Empty:
            break
    self.session.commit()
    self.session.close()
def apply(self, xs, clear=True, parallelism=None, progress_bar=True, count=None, **kwargs):
    """
    Apply the given UDF to the set of objects xs, either single or multi-threaded,
    and optionally calling clear() first.
    """
    # Clear everything downstream of this UDF if requested
    if clear:
        self.logger.info("Clearing existing...")
        Session = new_sessionmaker()
        session = Session()
        self.clear(session, **kwargs)
        session.commit()
        session.close()

    # Execute the UDF
    self.logger.info("Running UDF...")

    # Set up progress bar
    if (progress_bar and hasattr(xs, "__len__")) or count is not None:
        self.logger.debug("Setting up progress bar...")
        n = count if count is not None else len(xs)
        self.pb = tqdm(total=n)

    if parallelism is None or parallelism < 2:
        self.apply_st(xs, clear=clear, count=count, **kwargs)
    else:
        self.apply_mt(xs, parallelism, clear=clear, **kwargs)

    # Close progress bar
    if self.pb is not None:
        self.logger.debug("Closing progress bar...")
        self.pb.close()
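# The progress bar in apply() above is driven manually rather than by wrapping an
# iterable: it is constructed with a known total, advanced one tick per processed item
# (see the runner's multiprocess loop), and closed when work finishes. A minimal
# standalone equivalent of that pattern, with made-up work, is sketched here.
from tqdm import tqdm

items = ["a", "b", "c", "d", "e"]
pb = tqdm(total=len(items))
for _ in items:
    pb.update(1)   # one tick per processed item, mirroring pb.update(1) in the runner
pb.close()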
def run(self) -> None:
    """Run function of UDF.

    Call this method when the UDF is run as a Process in a multiprocess setting.
    The basic routine is: get from JoinableQueue, apply, put / add outputs, loop.
    """
    # Each UDF starts its own Engine
    # See SQLAlchemy docs on using connection pools with multiprocessing.
    Session = new_sessionmaker()
    session = Session()
    while True:
        doc = self.in_queue.get()  # block until an item is available
        if doc == UDF.TASK_DONE:
            break
        # Merge the object with the session owned by the current child process.
        # This does not happen during parsing when doc is transient.
        if not inspect(doc).transient:
            doc = session.merge(doc, load=False)
        y = self.apply(doc, **self.apply_kwargs)
        self.out_queue.put((doc.name, y))
    session.commit()
    session.close()
def __init__(
    self,
    in_queue: Optional[Queue] = None,
    out_queue: Optional[JoinableQueue] = None,
    worker_id: int = 0,
    **udf_init_kwargs: Any,
) -> None:
    """
    in_queue: A Queue of input objects to process; primarily for running in parallel
    """
    super().__init__()
    self.daemon = True
    self.in_queue = in_queue
    self.out_queue = out_queue
    self.worker_id = worker_id

    # Each UDF starts its own Engine
    # See SQLAlchemy docs on using connection pools with multiprocessing.
    Session = new_sessionmaker()
    self.session = Session()

    # We use a workaround to pass in the apply kwargs
    self.apply_kwargs: Dict[str, Any] = {}
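# The "Each UDF starts its own Engine" comments above follow SQLAlchemy's guidance that
# an Engine's connection pool must not be shared across forked processes. A session
# factory in that spirit might look like the sketch below; the function name and the
# connection string are illustrative assumptions, not Fonduer's actual new_sessionmaker.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker


def example_sessionmaker(conn_string="postgresql://localhost:5432/example_db"):
    # Build a fresh Engine (and thus a fresh connection pool) per call, so every
    # child process constructs its own pool instead of inheriting the parent's.
    engine = create_engine(conn_string)
    return sessionmaker(bind=engine)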
def _apply(
    self, doc_loader: Collection[Document], parallelism: int, **kwargs: Any
) -> None:
    """Run the UDF in parallel using python multiprocessing."""
    if not Meta.postgres:
        raise ValueError("Fonduer must use PostgreSQL as a database backend.")

    # Create an input queue to feed documents to UDF workers
    manager = Manager()
    # Set maxsize (#435). The number is heuristically determined.
    in_queue = manager.Queue(maxsize=parallelism * 2)
    # Use an output queue to track multiprocess progress
    out_queue = manager.Queue()

    # Clear the last documents parsed by the last run
    self.last_docs = set()

    # Create a DB session factory so each UDF can insert data (#545)
    session_factory = new_sessionmaker()

    # Create UDF Processes
    for i in range(parallelism):
        udf = self.udf_class(
            session_factory=session_factory,
            runner=self,
            in_queue=in_queue,
            out_queue=out_queue,
            worker_id=i,
            **self.udf_init_kwargs,
        )
        udf.apply_kwargs = kwargs
        self.udfs.append(udf)

    # Start the UDF processes
    for udf in self.udfs:
        udf.start()

    # Fill input queue with documents, but # of docs in queue is capped (#435).
    def in_thread_func() -> None:
        # Do not use session here to prevent concurrent use (#482).
        for doc in doc_loader:
            in_queue.put(doc)  # block until a free slot is available

    Thread(target=in_thread_func).start()

    count_parsed = 0
    total_count = len(doc_loader)
    while (
        any([udf.is_alive() for udf in self.udfs]) or not out_queue.empty()
    ) and count_parsed < total_count:
        # Get doc from the out_queue and persist the result into postgres
        try:
            doc_name = out_queue.get()  # block until an item is available
            self.last_docs.add(doc_name)
            # Update progress bar whenever an item has been processed
            count_parsed += 1
            if self.pb is not None:
                self.pb.update(1)
        except Exception as e:
            # Raise an error for all the other exceptions.
            raise e

    # Join the UDF processes
    for _ in self.udfs:
        in_queue.put(UDF.TASK_DONE)
    for udf in self.udfs:
        udf.join()

    # Flush the processes
    self.udfs = []
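# A self-contained sketch of the coordination pattern _apply() uses: a feeder thread
# fills a size-capped input queue, worker processes drain it, the parent collects one
# result per document, and one TASK_DONE sentinel per worker shuts the pool down.
# Everything below (worker, the fake "documents", the queue sizes) is illustrative only.
from multiprocessing import Manager, Process
from threading import Thread

TASK_DONE = "TASK_DONE"


def worker(in_queue, out_queue):
    while True:
        doc = in_queue.get()          # block until a document or the sentinel arrives
        if doc == TASK_DONE:
            break
        out_queue.put(doc.upper())    # stand-in for self.apply(doc, **self.apply_kwargs)


if __name__ == "__main__":
    manager = Manager()
    in_queue = manager.Queue(maxsize=4)   # capped, like maxsize=parallelism * 2 above
    out_queue = manager.Queue()
    docs = ["doc_a", "doc_b", "doc_c"]

    workers = [Process(target=worker, args=(in_queue, out_queue)) for _ in range(2)]
    for w in workers:
        w.start()

    # Feed documents from a separate thread so put() can block when the queue is full.
    Thread(target=lambda: [in_queue.put(d) for d in docs]).start()

    results = [out_queue.get() for _ in docs]   # one result per document, as in _apply()

    for _ in workers:
        in_queue.put(TASK_DONE)                 # one sentinel per worker
    for w in workers:
        w.join()
    print(sorted(results))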
def apply(self, split, key_group=0, replace_key_set=True, update_keys=False,
          update_values=True, storage=None, ignore_keys=[], **kwargs):
    if update_keys:
        replace_key_set = False

    # Get the cids based on the split, and also the count
    Session = new_sessionmaker()
    session = Session()

    # Note: In the current UDFRunner implementation, we load all these into memory and fill a
    # multiprocessing JoinableQueue with them before starting... so might as well load them here and pass in.
    # Also, if we try to pass in a query iterator instead, with AUTOCOMMIT on, we get a TXN error...
    candidates = session.query(Candidate).filter(Candidate.split == split).all()
    cids_count = len(candidates)
    if cids_count == 0:
        raise ValueError("No candidates in current split")

    # Setting up job batches
    chunks = cids_count // self.batch_size
    batch_range = [(i * self.batch_size, (i + 1) * self.batch_size) for i in range(chunks)]
    remainder = cids_count % self.batch_size
    if remainder:
        batch_range.append((chunks * self.batch_size, cids_count))

    old_table_name = None
    table_name = self.table_name

    # Run the Annotator
    with _meta.engine.connect() as con:
        table_already_exists = table_exists(con, table_name)
        if update_values and table_already_exists:
            # Now we extract under a temporary name for merging
            old_table_name = table_name
            table_name += "_updates"

        segment_file_blob = os.path.join(
            segment_dir, _segment_filename(_meta.DBNAME, self.table_name, split))
        remove_files(segment_file_blob)
        cache = True if self.annotation_type == "feature" else False
        super(BatchAnnotator, self).apply(
            batch_range, table_name=self.table_name, split=split, cache=cache, **kwargs)

        # Insert and update keys
        if not table_already_exists or old_table_name:
            con.execute(
                "CREATE TABLE %s(candidate_id integer PRIMARY KEY, keys text[] NOT NULL, values real[] NOT NULL)"
                % table_name)
        copy_postgres(segment_file_blob, table_name, "candidate_id, keys, values")
        remove_files(segment_file_blob)

        # Replace the LIL table with COO if requested
        if storage == "COO":
            temp_coo_table = table_name + "_COO"
            con.execute(
                "CREATE TABLE %s AS "
                "(SELECT candidate_id, UNNEST(keys) as key, UNNEST(values) as value from %s)"
                % (temp_coo_table, table_name))
            con.execute("DROP TABLE %s" % table_name)
            con.execute("ALTER TABLE %s RENAME TO %s" % (temp_coo_table, table_name))
            con.execute("ALTER TABLE %s ADD PRIMARY KEY(candidate_id, key)" % table_name)

            # Update old table
            if old_table_name:
                con.execute(
                    "INSERT INTO %s SELECT * FROM %s ON CONFLICT(candidate_id, key) "
                    "DO UPDATE SET value=EXCLUDED.value" % (old_table_name, table_name))
                con.execute("DROP TABLE %s" % table_name)
        else:  # LIL
            # Update old table
            if old_table_name:
                con.execute(
                    "INSERT INTO %s AS old SELECT * FROM %s ON CONFLICT(candidate_id) "
                    "DO UPDATE SET "
                    "values=old.values || EXCLUDED.values,"
                    "keys=old.keys || EXCLUDED.keys" % (old_table_name, table_name))
                con.execute("DROP TABLE %s" % table_name)

        if old_table_name:
            table_name = old_table_name

        # Load the matrix
        key_table_name = self.key_table_name
        if key_group:
            key_table_name = self.key_table_name + "_" + get_sql_name(key_group)

        return load_annotation_matrix(
            con,
            candidates,
            split,
            table_name,
            key_table_name,
            replace_key_set,
            storage,
            update_keys,
            ignore_keys,
        )
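# For context on the storage switch in BatchAnnotator.apply(): the table is first written
# in a LIL-style layout (one row per candidate holding parallel keys/values arrays) and,
# when storage == "COO", rewritten via UNNEST into a COO-style layout (one row per
# (candidate_id, key) pair). The pure-Python sketch below mirrors that reshaping on
# made-up data; it is only an illustration of what the SQL does.
lil_row = {"candidate_id": 7, "keys": ["f1", "f2"], "values": [0.5, 1.0]}
coo_rows = [
    {"candidate_id": lil_row["candidate_id"], "key": k, "value": v}
    for k, v in zip(lil_row["keys"], lil_row["values"])
]
assert coo_rows == [
    {"candidate_id": 7, "key": "f1", "value": 0.5},
    {"candidate_id": 7, "key": "f2", "value": 1.0},
]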