def end(self):
    """Do the push once all Futures results are in"""
    dbg("Waiting for %d async futures..." % len(self.results))
    timeout = self.context.get("timeout", None)
    close = self.context.get("close", None)
    loop = asyncio.get_event_loop()
    try:
        done, pending = loop.run_until_complete(
            asyncio.wait(self.results, timeout=timeout)
        )
        if timeout and pending:
            cancel_asyncio_tasks(
                pending, loop, cancel_timeout=ASYNCIO_CANCEL_TIMEOUT
            )
            raise asyncio.TimeoutError(
                "%d/%d tasks pending" % (len(pending), len(self.results))
            )
        results = [task.result() for task in done]
    finally:
        if close:
            loop.close()
    if results and self.context.get("flatten", False):
        results = flatten(results)
    self.push(results)
def end(self):
    """Do the push once all results are in"""
    dbg("Waiting for %d RQ job(s)..." % len(self.results))
    results = get_async_results(self.results)
    if results and self.context.get("flatten", False):
        results = flatten(results)
    self.push(results)
def consume(self, data=None, cleanup=None, split_count=None,
            synchronous=False, timeout=None, **node_contexts):
    """Set up node contexts and consume data with the pipeline

    Parameters
    ----------
    data : iterable, optional
        Iterable of data to consume
    cleanup : dict, optional
        A mapping of arg names to clean up functions to be run after
        data processing is complete.
    split_count : int, optional
        How many slices to split the data into for parallel processing.
        Default is to inspect the celery app and set
        split_count = worker count.
    synchronous : bool, optional
        If False, return AsyncResults. If True, wait for tasks to
        complete and return their results, if any.
    timeout : int or float, optional
        If waiting for results, pass this as timeout to
        AsyncResult.get().
    **node_contexts
        Keyword arguments that are node_name->param_dict
    """
    if not split_count:
        dbg("determining split count from app celery worker count")
        app_stats = self.consume_task.app.control.inspect().stats()
        split_count = len(app_stats.keys())

    split_count = split_count_helper(data, split_count)
    if data is None:
        splits = [None for s in range(split_count)]
    else:
        splits = divide_data(data, split_count)
    dbg("%s: data len: %s, splits: %d"
        % (self.__class__.__name__, size(data, "n/a"), split_count))

    async_results = []
    for split in splits:
        async_results.append(
            self.consume_task.delay(
                self.pipeline, split, cleanup=cleanup, **node_contexts
            )
        )

    if synchronous:
        results = []
        for async_result in async_results:
            try:
                results.append(async_result.get(timeout=timeout))
            finally:
                async_result.forget()
        return results

    return async_results
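# A minimal usage sketch for the celery-based consume() above. The
# `para_glider` instance and the "load" node name are hypothetical,
# purely to illustrate the node_name->param_dict convention.
results = para_glider.consume(
    range(1000),
    split_count=4,               # skip the celery worker-count inspection
    synchronous=True,            # block and gather task results
    timeout=300,                 # forwarded to AsyncResult.get()
    load=dict(table="output"),   # context params for a node named "load"
)
# With synchronous=False (the default), celery AsyncResult objects are
# returned and the caller is responsible for fetching/forgetting them.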
def create_like(self, conn, cursor, table, like_table, drop=False):
    """Create a table like another table, optionally trying to drop
    `table` first"""
    table = escape_string(str(table).strip("`"))
    like_table = escape_string(str(like_table).strip("`"))

    if drop:
        drop_sql = "drop table if exists %s" % table
        dbg(drop_sql)
        self.execute(conn, cursor, drop_sql)

    if isinstance(conn, sqlite3.Connection):
        get_create_sql = (
            "SELECT sql FROM sqlite_master WHERE type='table' AND name=?"
        )
        qr = self.execute(conn, cursor, get_create_sql, params=(like_table,))
        row = qr.fetchone()
        raiseifnot(isinstance(row, sqlite3.Row),
                   "Only sqlite3.Row rows are supported")
        create_sql = row["sql"].replace(like_table, table)
    else:
        # Assume this syntax works with most other SQL databases
        create_sql = "create table %s like %s" % (table, like_table)

    dbg(create_sql)
    self.execute(conn, cursor, create_sql)
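# Usage note for create_like() above: the sqlite3 branch asserts that rows
# come back as sqlite3.Row, so the connection needs a row_factory set
# beforehand. A sketch, where `node` (the SQL node instance) is assumed:
import sqlite3

conn = sqlite3.connect("example.db")
conn.row_factory = sqlite3.Row  # required by the raiseifnot check above
cursor = conn.cursor()
node.create_like(conn, cursor, "target_table__swap", "target_table", drop=True)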
def consume(self, data=None, cleanup=None, split_count=None,
            synchronous=False, timeout=None, **node_contexts):
    """Set up node contexts and consume data with the pipeline

    Parameters
    ----------
    data : iterable, optional
        Iterable of data to consume
    cleanup : dict, optional
        A mapping of arg names to clean up functions to be run after
        data processing is complete.
    split_count : int, optional
        How many slices to split the data into for parallel processing.
        Default is the number of workers in the provided queue.
    synchronous : bool, optional
        If False, return Jobs. If True, wait for jobs to complete and
        return their results, if any.
    timeout : int or float, optional
        If waiting for results, raise an exception if polling for all
        results takes longer than timeout seconds.
    **node_contexts
        Keyword arguments that are node_name->param_dict
    """
    if not split_count:
        dbg("determining split count from rq worker count")
        workers = Worker.all(queue=self.queue)
        split_count = len(workers)

    split_count = split_count_helper(data, split_count)
    if data is None:
        splits = [None for s in range(split_count)]
    else:
        splits = divide_data(data, split_count)
    dbg("%s: data len: %s, splits: %d"
        % (self.__class__.__name__, size(data, "n/a"), split_count))

    async_results = []
    for split in splits:
        async_results.append(
            self.queue.enqueue(
                rq_consume,
                args=(self.pipeline, split),
                kwargs=dict(cleanup=cleanup, **node_contexts),
            )
        )

    if synchronous:
        return get_async_results(async_results, timeout=timeout)

    return async_results
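# Usage sketch for the RQ-based consume() above, assuming `rq_glider` is
# an instance of this class built around an rq.Queue with workers attached
# (both hypothetical here).
jobs = rq_glider.consume(range(1000), split_count=2)  # returns rq Job objects
results = rq_glider.consume(
    range(1000),
    synchronous=True,  # poll via get_async_results
    timeout=120,       # raise RQTimeoutException if polling exceeds 120s
)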
def end(self):
    """Do the push once all Futures results are in"""
    dbg("Waiting for %d futures..." % len(self.results))
    timeout = self.context.get("timeout", None)
    results = []
    for future in as_completed(self.results, timeout=timeout):
        results.append(future.result())
    if results and self.context.get("flatten", False):
        results = flatten(results)
    self.push(results)
def run(
    self,
    rows,
    conn,
    cursor=None,
    schema=None,
    commit=True,
    rollback=False,
    dry_run=False,
):
    """Create and bulk load a temp table

    Parameters
    ----------
    rows
        Iterable of rows to load to the table
    conn
        Database connection
    cursor : optional
        Database connection cursor
    schema : str, optional
        Schema to create temp table in
    commit : bool, optional
        If true try to commit the transaction. If your connection
        autocommits this will have no effect. If this is a SQLAlchemy
        connection and you are in a transaction, it will try to get a
        reference to the current transaction and call commit on that.
    rollback : bool, optional
        If true try to rollback the transaction on exceptions. Behavior
        may vary by backend DB library if you are not currently in a
        transaction.
    dry_run : bool, optional
        If true, skip actually loading the data
    """
    table = get_temp_table(conn, rows, create=True, schema=schema)
    sql = self.get_bulk_statement(conn, "REPLACE", table.name, rows)
    dbg("Loading %d rows\n%s" % (size(rows, "n/a"), sqlformat(sql)),
        indent="label")

    if dry_run:
        warn("dry_run=True, skipping load in %s.run" % self.__class__.__name__)
    else:
        if not cursor:
            cursor = self.get_sql_executor(conn)
        try:
            self.executemany(conn, cursor, sql, rows)
            if commit:
                self.commit(conn)
        except:
            if rollback:
                self.rollback(conn)
            raise

    self.push(table.name)
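# Usage sketch for the temp-table run() above (the `node`, `conn`, and row
# shapes are assumptions): the node pushes the generated temp table name,
# so a downstream node can query or join against it before the session ends.
rows = [(1, "a"), (2, "b")]
node.run(rows, conn, schema="scratch", commit=True, rollback=True)
# -> pushes the temp table's name, not the rows themselves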
def process(self, data):
    """Required method used by Consecution to process nodes"""
    arg_values, kwarg_values = self._get_run_arg_values()
    if self._log:
        print(format_msg(repr(data), label=self.name))
    else:
        dbg("size:%s %s" % (size(data), repr(data)), label=self.name)
    if self.run_requires_data:
        self._run(data, *arg_values, **kwarg_values)
    else:
        self._run(*arg_values, **kwarg_values)
def end(self):
    """Do the push once all results are in"""
    dbg("Waiting for %d celery task(s)..." % len(self.results))
    result_set = ResultSet(self.results)
    results = result_set.get(
        timeout=self.context.get("timeout", None),
        propagate=self.context.get("propagate", True),
        interval=self.context.get("interval", 0.5),
    )
    result_set.forget()
    if results and self.context.get("flatten", False):
        results = flatten(results)
    self.push(results)
def end(self):
    """Do the push once all Futures results are in.

    Warnings
    --------
    Dask futures will not work if you have closed your client connection!
    """
    dbg("Waiting for %d Dask futures..." % len(self.results))
    results = []
    for _, result in dask_as_completed(self.results, with_results=True):
        results.append(result)
    if results and self.context.get("flatten", False):
        results = pd.concat(results)
    self.push(results)
def get_async_results(async_results, timeout=None):
    """Poll for results"""
    # TODO: Is there a better option than polling?
    start = time.time()
    while complete_count(async_results) < len(async_results):
        diff = time.time() - start
        if timeout and diff >= timeout:
            raise RQTimeoutException(
                "get_async_results timed out after %.3fs" % diff
            )
        dbg("Sleeping %.3fs..." % POLL_SLEEP)
        time.sleep(POLL_SLEEP)
    return [job.result for job in async_results]
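# Worked example of the polling helper above, using the real rq/redis APIs
# but a hypothetical job function `some_func`.
from redis import Redis
from rq import Queue

queue = Queue(connection=Redis())
jobs = [queue.enqueue(some_func, i) for i in range(2)]
results = get_async_results(jobs, timeout=60)  # [job.result for each job]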
def transaction(self, conn, cursor=None):
    """Start a transaction. If conn is a SQLAlchemy conn return a
    reference to the transaction object, otherwise just return the conn
    which should have commit/rollback methods."""
    dbg("starting transaction: %s" % conn)
    if is_sqlalchemy_conn(conn):
        return conn.begin()

    # For SQLite and DBAPI connections we explicitly call begin.
    # https://docs.python.org/3/library/sqlite3.html#sqlite3-controlling-transactions
    if not cursor:
        cursor = self.get_sql_executor(conn)
    cursor.execute("BEGIN")
    return conn
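# Sketch of pairing transaction() with commit/rollback (the `node`, `conn`,
# `sql`, and `rows` names are placeholders). The returned object is either
# a SQLAlchemy Transaction or the DBAPI conn itself; both expose
# commit()/rollback().
tx = node.transaction(conn)
try:
    cursor = node.get_sql_executor(conn)
    node.executemany(conn, cursor, sql, rows)
    tx.commit()
except Exception:
    node.rollback(tx)
    raise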
def rollback(self, obj):
    """Rollback any currently active transactions"""
    dbg("rolling back transaction: %s" % obj)
    if hasattr(obj, "rollback"):
        obj.rollback()
    elif is_sqlalchemy_conn(obj):
        # See note above about this hack
        raiseifnot(
            hasattr(obj, "_Connection__transaction"),
            "Could not find transaction attribute on SQLAlchemy object: %s"
            % obj,
        )
        if getattr(obj, "_Connection__transaction", None):
            obj._Connection__transaction.rollback()
        else:
            raise AssertionError(
                "Trying to rollback a transaction but the SQLAlchemy "
                "conn was not in a transaction. It may have "
                "autocommitted."
            )
    else:
        raise AssertionError(
            "Could not determine how to rollback with object: %s" % obj
        )
def consume(pipeline, data, cleanup=None, **node_contexts):
    """Handles node contexts before/after calling pipeline.consume()

    Note
    ----
    It would have been better to subclass Pipeline and implement this
    logic right before/after the core consume() call, but there is a bug
    in pickle that prevents that from working with multiprocessing.
    """
    update_node_contexts(pipeline, node_contexts)
    try:
        contexts = get_node_contexts(pipeline)
        dbg("size=%s\n%s" % (size(data, "n/a"), pf(contexts)), indent="label")
        try:
            if data is None:
                return consume_none(pipeline)
            else:
                return pipeline.consume(iterize(data))
        finally:
            if cleanup:
                clean_up_nodes(cleanup, contexts)
    finally:
        reset_node_contexts(pipeline, node_contexts)
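# Shape of the node_contexts kwargs handled above (the node names "extract"
# and "load" and the objects passed in are hypothetical): each keyword is a
# node name in the pipeline mapped to a dict of runtime params for that node.
consume(
    pipeline,
    data,
    cleanup=dict(load_conn=lambda conn: conn.close()),  # see clean_up_nodes
    extract=dict(url="http://example.com/data.csv"),
    load=dict(conn=db_conn, table="output"),
)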
def clean_up_nodes(cleanup, contexts):
    """Call clean up functions for node context objects"""
    errors = []
    cleaned = set()

    # This block will clean any arg names that match regardless of node name
    removes = set()
    for node_name, context in contexts.items():
        for arg_name, arg_value in context.items():
            if arg_name in cleanup:
                cleaned.add((node_name, arg_name))
                removes.add(arg_name)
                func = cleanup[arg_name]
                try:
                    func(arg_value)
                except Exception as e:
                    dbg("Exception during clean up: %s" % str(e))

    for key in removes:
        del cleanup[key]

    # This block handles specific node_name/arg_name pairs
    for key, func in cleanup.items():
        parts = key.split("_")
        node_name = parts[0]
        arg_name = "_".join(parts[1:])

        if node_name not in contexts:
            errors.append("Could not clean up %s, invalid node name: %s"
                          % (key, node_name))
            continue

        if arg_name not in contexts[node_name]:
            errors.append(
                "Could not clean up %s, invalid node arg name: %s->%s"
                % (key, node_name, arg_name)
            )
            continue

        if (node_name, arg_name) in cleaned:
            dbg("Skipping clean up for %s->%s, already cleaned"
                % (node_name, arg_name))
            continue

        ctx_value = contexts[node_name][arg_name]
        if not ctx_value:
            dbg("Skipping clean up for %s->%s, value is blank"
                % (node_name, arg_name))
            continue

        if isinstance(ctx_value, RuntimeContext):
            dbg("Skipping clean up for %s->%s, value is RuntimeContext object"
                % (node_name, arg_name))
            continue

        dbg("Executing clean up for %s->%s" % (node_name, arg_name))
        try:
            func(ctx_value)
        except Exception as e:
            errors.append("Failed to clean up %s->%s: %s"
                          % (node_name, arg_name, str(e)))

    if errors:
        raise Exception("Errors during clean_up: %s" % errors)
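# The cleanup mapping above supports two key styles (sketch with
# hypothetical node/arg names):
#
# 1. A bare arg name, applied to that arg in every node's context:
cleanup = {"conn": lambda c: c.close()}
#
# 2. A node-qualified "<node_name>_<arg_name>" key targeting one node.
#    Note that key.split("_") takes only the first piece as the node name,
#    so node names containing underscores won't resolve in this branch:
cleanup = {"load_conn": lambda c: c.close()}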
def rename_tables(self, conn, cursor, renames):
    """Execute one or more table renames"""
    for t1, t2 in renames:
        sql = escape_string("ALTER TABLE %s RENAME TO %s" % (t1, t2))
        dbg(sql)
        self.execute(conn, cursor, sql)
def run(
    self,
    data,
    frm=None,
    to=None,
    subject=None,
    body=None,
    html=None,
    attach_as="attachment",
    attachment_name=None,
    formatter=None,
    client=None,
    host=None,
    port=None,
    username=None,
    password=None,
    dry_run=False,
):
    """Load data to email via SMTP.

    Parameters
    ----------
    data
        EmailMessage or data to send. If the latter, the message will
        be created from the other node arguments.
    frm : str, optional
        The from email address
    to : str or list, optional
        A str or list of destination email addresses
    subject : str, optional
        The email subject
    body : str, optional
        The email text body
    html : str, optional
        The email html body
    attach_as : str
        Where to put the data in the email message if building the
        message from node arguments. Options: attachment, body, html.
    attachment_name : str, optional
        The file name to write the data to when attaching data to the
        email. The file extension will be used to infer the mimetype of
        the attachment. This should not be a full path as a temp
        directory will be created for this.
    formatter : callable, optional
        A function to format and return a string from the input data if
        attach_as is set to "body" or "html".
    client : optional
        A connected smtplib.SMTP client
    host : str, optional
        The SMTP host to connect to if no client is provided
    port : int, optional
        The SMTP port to connect to if no client is provided
    username : str, optional
        The SMTP username for login if no client is provided
    password : str, optional
        The SMTP password for login if no client is provided
    dry_run : bool, optional
        If true, skip actually loading the data
    """
    if isinstance(data, EmailMessage):
        msg = data
    else:
        # Assume it's data that needs to be converted to attachments and sent
        raiseifnot(
            frm and to and subject,
            "Node context must have frm/to/subject set to create an email msg",
        )
        raiseifnot(
            isinstance(data, str),
            "data must be passed as raw str content, got %s" % type(data),
        )

        attachments = None
        tmpdir = None
        if attach_as == "attachment":
            raiseifnot(
                attachment_name,
                "Must specify an attachment_name when attach_as = attachment",
            )
            tmpdir = tempfile.TemporaryDirectory()
            filename = tmpdir.name + "/" + attachment_name
            with open(filename, "w") as f:
                f.write(data)
            attachments = [filename]
        else:
            fmt_data = formatter(data) if formatter else data
            if attach_as == "body":
                body = (body or "") + fmt_data
            elif attach_as == "html":
                html = (html or "") + fmt_data
            else:
                raise AssertionError(
                    "Invalid attach_as value: %s, options: attachment, body, html"
                    % attach_as
                )

        msg = create_email(
            frm, to, subject, body=body, html=html, attachments=attachments
        )

        if tmpdir:
            tmpdir.cleanup()

    if dry_run:
        warn("dry_run=True, skipping load in %s.run" % self.__class__.__name__)
    else:
        dbg("Sending msg %s to %s" % (msg["Subject"], msg["To"]))
        send_email(
            msg,
            client=client,
            host=host,
            port=port,
            username=username,
            password=password,
        )

    self.push(data)
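# Usage sketch for the email load run() above (all values hypothetical):
# attach a raw CSV payload to a message built from node arguments.
node.run(
    "id,name\n1,a\n",              # raw str content to attach
    frm="reports@example.com",
    to=["team@example.com"],
    subject="Daily export",
    attach_as="attachment",
    attachment_name="export.csv",  # extension drives the mimetype guess
    host="smtp.example.com",
    port=587,
    username="reports@example.com",
    password="not-a-real-password",
)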
def drop_table(self, conn, cursor, table):
    """Drop tables all day long"""
    drop_sql = escape_string("drop table %s" % table)
    dbg(drop_sql)
    self.execute(conn, cursor, drop_sql)
def run(self, criteria, sort=None, folder="INBOX", client=None, host=None,
        username=None, password=None, push_all=False, push_type="message",
        limit=None, **kwargs):
    """Extract data from an email inbox and push the data forward.

    Note
    ----
    Instances of IMAPClient are NOT thread safe. They should not be
    shared and accessed concurrently from multiple threads.

    Parameters
    ----------
    criteria : str or list
        Criteria argument passed to IMAPClient.search. See
        https://tools.ietf.org/html/rfc3501.html#section-6.4.4.
    sort : str or list, optional
        Sort criteria passed to IMAPClient.sort. Note that SORT is an
        extension to the IMAP4 standard so it may not be supported by
        all IMAP servers. See https://tools.ietf.org/html/rfc5256.
    folder : str, optional
        Folder to read emails from
    client : optional
        An established IMAPClient connection. If not present, the
        host/login information is required.
    host : str, optional
        The IMAP host to connect to
    username : str, optional
        The IMAP username for login
    password : str, optional
        The IMAP password for login
    push_all : bool, optional
        When true, push all retrieved data/emails at once
    push_type : str, optional
        What type of data to extract and push from the emails.
        Options include:

        * **message**: push email.message.EmailMessage objects
        * **message_id**: push a list of message IDs that can be fetched
        * **all**: push a list of
          dict(message=<email.message.EmailMessage>, payload=<extracted payload>)
        * **body**: push a list of email bodies
        * **attachment**: push a list of attachments (an email with
          multiple attachments will be grouped in a sublist)
    limit : int, optional
        Limit to N rows
    **kwargs
        Keyword arguments to pass to IMAPClient if no client is passed
    """
    data = []
    logout = False
    push_types = ["message_id", "message", "all", "body", "attachment"]

    if not client:
        raiseifnot(
            host and username and password,
            "Host/Username/Password required to create IMAPClient",
        )
        dbg("Logging into IMAPClient %s/%s" % (host, username))
        logout = True
        client = IMAPClient(host, **kwargs)
        client.login(username, password)

    try:
        client.select_folder(folder)
        if sort:
            messages = client.sort(sort, criteria=criteria)
        else:
            messages = client.search(criteria)
        dbg("Found %d email messages" % len(messages))

        if push_type == "message_id":
            if limit:
                data = messages[:limit]
            else:
                data = messages
        else:
            raiseifnot(
                push_type in push_types,
                "Unrecognized push_type: %s, options: %s"
                % (push_type, push_types),
            )
            count = 0
            for msg_id, msg_data in client.fetch(messages, ["RFC822"]).items():
                raw = msg_data[b"RFC822"].decode("utf8")
                msg = parser.Parser(policy=policy.default).parsestr(raw)
                if push_type == "message":
                    data.append(msg)
                else:
                    payload = extract_email_payload(msg)
                    if push_type == "body":
                        data.append(payload[0])
                    elif push_type == "attachment":
                        data.append(payload[1:])
                    elif push_type == "all":
                        data.append(dict(message=msg, payload=payload))
                count += 1
                if limit and count >= limit:
                    break
    finally:
        if logout:
            client.logout()

    if push_all:
        self.push(data)
    else:
        for row in data:
            self.push(row)
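# Usage sketch for the IMAP run() above (host/credentials hypothetical):
# pull the bodies of the five most recently arrived unseen messages and
# push them one at a time.
node.run(
    ["UNSEEN"],                # criteria per RFC 3501 SEARCH
    sort=["REVERSE ARRIVAL"],  # requires server SORT support (RFC 5256)
    folder="INBOX",
    host="imap.example.com",
    username="user@example.com",
    password="not-a-real-password",
    push_type="body",
    limit=5,
)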
def run(
    self,
    rows,
    conn,
    table,
    cursor=None,
    commit=True,
    rollback=False,
    stmt_type="REPLACE",
    odku=False,
    swap=False,
    keep_old=False,
    push_data=False,
    dry_run=False,
):
    """Form SQL statement and use bulk execute to write rows to table

    Parameters
    ----------
    rows
        Iterable of rows to load to the table
    conn
        Database connection
    table : str
        Name of a table to write the data to
    cursor : optional
        Database connection cursor
    commit : bool, optional
        If true try to commit the transaction. If your connection
        autocommits this will have no effect. If this is a SQLAlchemy
        connection and you are in a transaction, it will try to get a
        reference to the current transaction and call commit on that.
    rollback : bool, optional
        If true try to rollback the transaction on exceptions. Behavior
        may vary by backend DB library if you are not currently in a
        transaction.
    stmt_type : str, optional
        Type of SQL statement to use (REPLACE, INSERT, etc.). **Note:**
        Backend support for this varies.
    odku : bool or list, optional
        If true, add ON DUPLICATE KEY UPDATE clause for all columns. If
        a list then only add it for the specified columns. **Note:**
        Backend support for this varies.
    swap : bool, optional
        If true, load a table and then swap it into the target table
        via rename. Not supported with all database back ends.
    keep_old : bool, optional
        If true and swapping tables, keep the original table with a
        __old suffix added to the name
    push_data : bool, optional
        If true, push the data forward instead of the table name
    dry_run : bool, optional
        If true, skip actually loading the data
    """
    load_table = table
    if swap:
        load_table = add_table_suffix(table, "__swap")

    sql = self.get_bulk_statement(conn, stmt_type, load_table, rows, odku=odku)
    dbg("Loading %d rows\n%s" % (size(rows, "n/a"), sqlformat(sql)),
        indent="label")

    if dry_run:
        warn("dry_run=True, skipping load in %s.run" % self.__class__.__name__)
    else:
        if not cursor:
            cursor = self.get_sql_executor(conn)
        try:
            if swap:
                self.create_like(conn, cursor, load_table, table, drop=True)

            self.executemany(conn, cursor, sql, rows)

            if swap:
                old_table = add_table_suffix(table, "__old")
                self.rename_tables(
                    conn, cursor, [(table, old_table), (load_table, table)]
                )
                if not keep_old:
                    self.drop_table(conn, cursor, old_table)

            if commit:
                self.commit(conn)
        except:
            if rollback:
                self.rollback(conn)
            raise

    if push_data:
        self.push(rows)
    else:
        self.push(table)
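# Usage sketch for the swap-based load above (the `node`, `conn`, and rows
# are assumptions): load into <table>__swap, then rename it into place so
# readers never see a partially loaded table.
node.run(
    rows,
    conn,
    "daily_summary",
    swap=True,      # build "daily_summary__swap", then rename into place
    keep_old=True,  # retain the previous table as "daily_summary__old"
    rollback=True,
)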