def save_benchmark_timings(path, tablename, table_cache, instance_id: int):
    fn = os.path.join(path, "benchmark.bench")
    if not os.path.exists(fn):
        P.get_logger().warn(
            "file {} does not exist, no tool timings uploaded".format(fn))
    else:
        table = pandas.read_csv(fn, sep="\t")
        table["instance_id"] = instance_id
        table_cache.add_table(table, tablename)

def __init__(self, database_url, schema):
    self.database_url = database_url
    self.schema = schema
    self.cache = {}
    self.total_size = 0
    self.sizes = collections.defaultdict(int)
    self.uploaded_sizes = collections.defaultdict(int)
    self.dtypes = {}
    self.logger = P.get_logger()
    self.indices = {}
    self.have_created_indices = False

def check_unique(tool_functions, input_combos=None, input_regex=None,
                 input_alias=None, is_test=False):
    # compute a list of task names
    names = []
    if input_combos:
        for toolf, input_files in itertools.product(tool_functions,
                                                    input_combos):
            taskf = copy.copy(toolf)
            taskf.register_input(input_files,
                                 regex=input_regex,
                                 alias=input_alias,
                                 is_test=is_test)
            names.append(taskf.__name__)
    else:
        for toolf in tool_functions:
            taskf = copy.copy(toolf)
            taskf.register_input(regex=input_regex,
                                 alias=input_alias,
                                 is_test=is_test)
            names.append(taskf.__name__)

    counts = collections.Counter(names)
    for name, count in list(counts.items()):
        if count > 1:
            make_unique = True
            P.get_logger().debug(
                "adding hash identifier because of duplicate name: "
                "{}={}".format(name, count))
            break
    else:
        make_unique = False
    return make_unique

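# A minimal sketch of how check_unique is used by the add_*_to_pipeline
# helpers below: when two configurations of the same tool collapse to the
# same task name, the return value tells the caller to mangle names with a
# hash identifier. The surrounding names mirror their use further down in
# this module; the snippet is illustrative, not a test.
#
#   tool_functions = build_tool_functions(map_tool_to_runner, config)
#   make_unique = check_unique(tool_functions,
#                              input_combos=input_combos,
#                              input_regex=config["input"].get("regex"))
#   for taskf in tool_functions:
#       taskf.register_input(make_unique=make_unique)
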
def add_columns_to_table(columns, table, tablename, engine):
    pandas_engine = pandas.io.sql.SQLDatabase(engine)
    pandas_table = pandas.io.sql.SQLTable(tablename,
                                          pandas_engine,
                                          frame=table)
    new_columns = set(columns)
    logger = P.get_logger()

    for column in pandas_table.table.columns:
        if column.name in new_columns:
            statement = "ALTER TABLE {} ADD COLUMN {} {}".format(
                tablename, column.name, column.type)
            logger.debug("SQL: {}".format(statement))
            try:
                retry_sql_execute(engine, statement)
            except (sqlalchemy.exc.OperationalError,
                    sqlite3.OperationalError) as ex:
                # tolerate races: another worker may have added the
                # column in the meantime
                if "duplicate column name" not in str(ex):
                    raise

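# For a new string column "status" on table "metric_x" backed by sqlite,
# the statement built above would read, e.g. (column and table names are
# hypothetical; the type comes from pandas' dtype-to-SQL mapping):
#
#   ALTER TABLE metric_x ADD COLUMN status TEXT
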
def upload_metrics_tables(infiles: list,
                          run_id: int,
                          schema,
                          url: str,
                          max_workers: int = 10):
    logger = P.get_logger()
    engine = create_engine(url)
    Session = sessionmaker(bind=engine)
    session = Session()

    logger.info(f"{os.getpid()}: collecting upload items for "
                f"{len(infiles)} input files")
    metric_f = generate_metric
    pool = multiprocessing.Pool(max_workers)
    metrics = pool.map(metric_f, infiles)
    pool.close()
    pool.join()

    logger.info(f"{os.getpid()}: instantiating {len(metrics)} metrics")
    data = list(tqdm.tqdm(instantiate_metrics(metrics, session, run_id),
                          total=len(metrics)))

    logger.info(f"{os.getpid()}: uploading {len(data)} items")
    upload_f = upload_metric
    initargs = (upload_f, url, schema)
    if max_workers == 1:
        setup_worker(*initargs)
        list(map(upload_f, data))
        # flush the per-process cache set up by setup_worker
        global resource
        resource.table_cache.flush_all()
    else:
        logger.info(f"{os.getpid()}: loading data with {max_workers} cores")
        pool = multiprocessing.Pool(max_workers,
                                    initializer=setup_worker,
                                    initargs=initargs)
        pool.map(upload_f, data)
        pool.close()
        pool.join()

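# A minimal sketch of the worker initializer assumed above: setup_worker
# is expected to stash per-process state (a table cache for the given
# database) in a module-level global so that upload_metric can reach it
# without pickling. The body below is illustrative, not the actual
# implementation; the cache class is the one whose __init__ is shown
# earlier in this module.
#
#   def setup_worker(upload_f, url, schema):
#       global resource
#       resource = types.SimpleNamespace(
#           table_cache=TableCache(url, schema))
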
def reconcile_columns(tablename, engine, table):
    logger = P.get_logger()
    existing_columns = set(get_columns(tablename, engine))
    proposed_columns = set(table.columns)

    obsolete_columns = existing_columns.difference(proposed_columns)
    if obsolete_columns:
        logger.warn("the following columns are obsolete in {}: {}. "
                    "empty data will be inserted".format(
                        tablename, ", ".join(obsolete_columns)))
        # create empty columns
        for column in obsolete_columns:
            table[column] = None

    new_columns = proposed_columns.difference(existing_columns)
    if new_columns:
        logger.warn("new columns found for {}: the following columns "
                    "will be added: {}".format(
                        tablename, ", ".join(new_columns)))
        add_columns_to_table(new_columns, table, tablename, engine)
        # clear cache of memoization function
        get_columns.delete(tablename, engine)

def purge_run_id(run_id, url, dry_run=False, schemas=None):
    """remove a run from a database."""
    logger = P.get_logger()
    engine = sqlalchemy.create_engine(url)
    connection = engine.connect()

    # automap
    metadata = sqlalchemy.MetaData()
    metadata.reflect(engine)
    base = automap_base(metadata=metadata)
    base.prepare()

    if schemas is None:
        insp = inspect(engine)
        schemas = insp.get_schema_names()
        # note: default sqlite schema is "main"
        if 'public' in schemas:
            schemas.remove('public')
        if 'information_schema' in schemas:
            schemas.remove('information_schema')

    logger.debug("getting instance_id list of run_id={}".format(run_id))
    instance_ids = set(get_instance_ids_for_run_id(run_id, engine))
    logger.debug("found {} instances for run_id={}".format(
        len(instance_ids), run_id))

    non_metric_tables = ['run', 'instance', 'binary_data',
                         'metric_timings', 'tool_timings',
                         'metric_storage', 'tags']

    # delete from tables with field "instance_id"
    if instance_ids:
        for schema in schemas:
            # automap the schema
            metadata_schema = sqlalchemy.MetaData()
            metadata_schema.reflect(engine, schema=schema)
            base_schema = automap_base(metadata=metadata_schema)
            base_schema.prepare()
            for table_name in list(base_schema.metadata.tables.keys()):
                table = sqlalchemy.Table(table_name,
                                         metadata_schema,
                                         autoload=True)
                if "instance_id" not in table.c:
                    continue
                logger.debug("deleting data in {}".format(table_name))
                delete = table.delete().where(
                    table.c.instance_id.in_(instance_ids))
                if not dry_run:
                    connection.execute(delete)

    # delete from tables with field "run_id"
    for table_name in base.metadata.tables.keys():
        table = sqlalchemy.Table(table_name, metadata, autoload=True)
        if "run_id" not in table.c:
            continue
        logger.info("deleting data in {} for run_id {}".format(
            table_name, run_id))
        delete = table.delete().where(table.c.run_id == run_id)
        if not dry_run:
            connection.execute(delete)

    table = sqlalchemy.Table('run', metadata, autoload=True)
    delete = table.delete().where(table.c.id == run_id)
    logger.info("deleting data in 'run' for id {}".format(run_id))
    if not dry_run:
        connection.execute(delete)

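# Example usage (the run_id and URL are illustrative): preview the
# deletions with dry_run=True before purging for real.
#
#   purge_run_id(42, "sqlite:///./csvdb", dry_run=True)
#   purge_run_id(42, "sqlite:///./csvdb")
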
def upload_result(infiles, outfile, *extras):
    """upload results into database.

    Connection details for the database are taken from the
    configuration dictionary given as first argument to extras. The
    configuration dictionary should have an element 'database' with
    the required field ``url`` and the optional field ``schema``. For
    example, to upload to an sqlite database in the current directory
    called csvdb, use::

        config = {"database": {"url": "sqlite:///./csvdb"}}

    To use multiple cores, try::

        config = {"database": {"url": "sqlite:///./csvdb", "cores": 10}}

    Arguments
    ---------
    infiles: list
        List of files to upload. These should be the output of metric
        tasks in a benchmarking workflow.
    outfile: output file
        On success, an empty output file is created.
    extras: list
        List of one element containing a configuration dictionary
        (see above).
    """
    logger = P.get_logger()

    if len(extras) != 1:
        raise ValueError(
            "expecting only one extra argument "
            "(configuration dictionary)")

    config = extras[0]
    url = config["database"]["url"]
    max_workers = config["database"].get("cores", 1)
    schema = config["database"].get("schema", None)
    # TODO: check if schema exists to avoid incomplete
    # transaction.

    engine = create_engine(url)

    # skip the upload if the database is not reachable
    try:
        create_database(engine)
    except OperationalError as msg:
        logger.warn(
            "could not connect to database at {}. "
            "The data will not be uploaded. Msg={}".format(
                url, str(msg)))
        return

    # create the schema if it does not exist
    if schema is not None:
        engine.execute(
            str(text("CREATE SCHEMA IF NOT EXISTS {}".format(schema))))

    pipeline_name = os.path.basename(sys.argv[0])
    logger.debug("uploading data to {}, schema={}".format(url, schema))

    # TODO: add dependencies
    # dependencies = infiles[1:]
    # meta_data = dict([("dependency{}".format(x), y)
    #                   for x, y in enumerate(dependencies)])

    # use the modification time of benchmark.yml as the creation
    # time; this matters when re-loading, as otherwise all runs would
    # get the same time stamp.
    if os.path.exists("benchmark.yml"):
        s = os.stat("benchmark.yml")
        created = datetime.datetime.fromtimestamp(s.st_mtime)
    else:
        created = datetime.datetime.now()

    Session = sessionmaker(bind=engine)
    session = Session()

    benchmark_run = BenchmarkRun(
        author=os.environ.get("USER", "unknown"),
        # needs refactoring, should be: uploaded_at, created_at, run_at
        # uploaded_at=datetime.datetime.now(),
        created=created,
        pipeline_name=pipeline_name,
        pipeline_version=P.get_version().version,
        pipeline_dir=os.getcwd(),
        title=config["title"],
        description=config["description"],
        config=json.dumps(config),
        config_hash=hash(json.dumps(config)),
        status="incomplete")
    session.add(benchmark_run)
    session.commit()

    for tag in config["tags"]:
        benchmark_tag = BenchmarkTag(run_id=benchmark_run.id, tag=tag)
        session.add(benchmark_tag)
    session.commit()

    engine.dispose()
    del engine

    upload_metrics_tables(infiles,
                          benchmark_run.id,
                          schema,
                          url,
                          max_workers=max_workers)

    # upload table sizes
    # df_sizes = pandas.DataFrame.from_records(
    #     list(table_cache.uploaded_sizes.items()),
    #     columns=["tablename", "bytes_uploaded"])
    # df_sizes["bytes_resident"] = df_sizes.bytes_uploaded
    # df_sizes["run_id"] = benchmark_run.id
    # df_sizes["schema"] = schema
    # save_table(df_sizes,
    #            engine,
    #            "metric_storage",
    #            schema=None,
    #            is_sqlite3=is_sqlite3)

    mark_upload_complete(url, benchmark_run.id)

    logger.info("uploaded results under run_id {}".format(benchmark_run.id))
    touch(outfile)

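# A minimal sketch of calling upload_result directly, following the
# docstring above. The input file name is illustrative; in the workflow
# these arguments are filled in by ruffus.
#
#   config = {"database": {"url": "sqlite:///./csvdb", "cores": 4},
#             "title": "test run",
#             "description": "example upload",
#             "tags": ["test"]}
#   upload_result(["metric1.dir/metric1.tsv"], "upload.done", config)
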
def save_metric_data(meta_data, table_cache, schema, instance_id: int,
                     session):
    logger = P.get_logger()
    metric_table_filter = None
    if "metric_no_upload" in meta_data:
        if meta_data["metric_no_upload"] == "*":
            logger.warn("upload turned off for metric {}".format(
                meta_data["metric_name"]))
            return
        else:
            metric_table_filter = re.compile(meta_data["metric_no_upload"])

    # multiple tablenames for multiple metric output
    #
    # Tables are added into schemas to avoid cluttering
    # the public namespace.
    # (if only blobs, no metric output file)
    if "metric_output_files" in meta_data:
        assert len(meta_data["metric_output_files"]) == \
            len(meta_data["metric_tablenames"])

        for output_file, tablename in zip(
                meta_data["metric_output_files"],
                meta_data["metric_tablenames"]):

            if metric_table_filter and metric_table_filter.search(tablename):
                logger.warn("upload for table {} turned off".format(
                    tablename))
                continue

            if not os.path.exists(output_file):
                logger.warning("output file {} does not exist - "
                               "ignored".format(output_file))
                continue

            if IOTools.is_empty(output_file):
                logger.warn("output file {} is empty - ignored".format(
                    output_file))
                continue

            try:
                table = pandas.read_csv(output_file,
                                        sep="\t",
                                        comment="#",
                                        skip_blank_lines=True)
            except ValueError as e:
                logger.warn("table {} can not be read: {}".format(
                    output_file, str(e)))
                continue
            except pandas.errors.ParserError as e:
                logger.warn("malformatted table {} can not be read: "
                            "{}".format(output_file, str(e)))
                continue

            if table.empty:
                logger.warn("table {} is empty - ignored".format(
                    output_file))
                continue

            tablename, table, dtypes = transform_table_before_upload(
                tablename, table, instance_id, meta_data, table_cache)

            if schema is None:
                tn = tablename
            else:
                tn = "{}.{}".format(schema, tablename)

            # add foreign key
            table["instance_id"] = instance_id
            logger.debug(f"saving data {table.shape} from {output_file} "
                         f"to table {tn} under {instance_id}")
            table_cache.add_table(table, tablename, dtypes)

    if "metric_blob_globs" in meta_data:
        metric_dir = meta_data["metric_outdir"]
        files = [glob.glob(os.path.join(metric_dir, x))
                 for x in meta_data["metric_blob_globs"]]
        files = IOTools.flatten(files)
        logger.debug(
            "uploading binary data in {} files from {} to "
            "table binary_data".format(len(files), metric_dir))
        for fn in files:
            with IOTools.open_file(fn, "rb", encoding=None) as inf:
                data_row = BenchmarkBinaryData(
                    instance_id=instance_id,
                    filename=os.path.basename(fn),
                    path=fn,
                    data=inf.read())
                session.add(data_row)
            session.commit()

    if meta_data.get("metric_tableindices", None):
        table_cache.add_indices(meta_data["metric_tableindices"])

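# A hypothetical meta_data fragment accepted by save_metric_data; the
# keys mirror those queried above, the values are invented for
# illustration.
#
#   meta_data = {
#       "metric_name": "fastqc",
#       "metric_output_files": ["fastqc.dir/summary.tsv"],
#       "metric_tablenames": ["fastqc_summary"],
#       # regular expression of tables to skip, or "*" to skip all
#       "metric_no_upload": "per_base_.*",
#   }
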
def transform_table_before_upload(tablename, table, instance_id: int,
                                  meta_data, table_cache):
    dtypes = None
    logger = P.get_logger()

    # melt table if set by metric
    if "metric_upload_melted" in meta_data:
        if tablename in meta_data["metric_upload_melted"]:
            melt_data = meta_data["metric_upload_melted"][tablename]
            table = pandas.melt(
                table,
                id_vars=melt_data.get("id_vars", None),
                value_vars=melt_data.get("value_vars", None),
                var_name=melt_data.get("var_name", None),
                value_name=melt_data.get("value_name", None))
            logger.debug("melted data from table {}".format(tablename))

    if "metric_upload_transpose" in meta_data:
        if tablename in meta_data["metric_upload_transpose"]:
            table = table.transpose()

    # upload into a separate table suffixed by instance id
    if "metric_upload_separate" in meta_data:
        if tablename in meta_data["metric_upload_separate"]:
            tablename = "{}_{}".format(tablename, instance_id)

    # normalize table by factorizing a column and storing its ids
    # in a separate table
    if "metric_upload_normalize" in meta_data:
        if tablename in meta_data["metric_upload_normalize"]:
            for column in meta_data["metric_upload_normalize"][tablename]:
                if column not in table.columns:
                    raise ValueError(
                        "unknown column {} in table {} to be "
                        "normalized".format(column, tablename))
                factors, names = table[column].factorize()
                table[column] = factors
                table.rename(columns={column: column + "_id"},
                             inplace=True)
                factor_table = pandas.DataFrame(
                    {column: names,
                     "id": list(range(len(names)))})
                factor_table["instance_id"] = instance_id
                table_cache.add_table(factor_table, tablename + "_factors")

    # store table as a matrix
    if "metric_upload_as_matrix" in meta_data:
        if tablename in meta_data["metric_upload_as_matrix"]:
            groupby_columns = meta_data["metric_upload_as_matrix"][tablename]
            if not isinstance(groupby_columns, list):
                groupby_columns = [groupby_columns]
            take_columns = [x for x in table.columns
                            if x not in groupby_columns]
            row_index_column = take_columns.pop(0)
            rows = []
            if not groupby_columns:
                matrix = table[take_columns].to_numpy()
                rows.append(
                    [",".join(map(str, table[row_index_column])),
                     ",".join(map(str, take_columns)),
                     str(matrix.dtype),
                     matrix.tobytes()])
            else:
                for key, group in table.groupby(by=groupby_columns):
                    if not isinstance(key, tuple):
                        key = [key]
                    matrix = group[take_columns].to_numpy()
                    rows.append(
                        list(key) +
                        [",".join(map(str, group[row_index_column])),
                         ",".join(map(str, take_columns)),
                         str(matrix.dtype),
                         matrix.tobytes()])
            table = pandas.DataFrame.from_records(
                rows,
                columns=groupby_columns + ["rows", "columns",
                                           "dtype", "data"])
            dtypes = {"data": LargeBinary}

    return tablename, table, dtypes

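# A hypothetical configuration driving the transforms above; each key
# maps table names to transform options. Table and column names are
# invented for illustration.
#
#   meta_data = {
#       "metric_upload_melted": {
#           "counts": {"id_vars": ["sample"], "value_name": "count"}},
#       "metric_upload_normalize": {"calls": ["chromosome"]},
#       "metric_upload_as_matrix": {"scores": ["sample"]},
#   }
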
def add_metrics_to_pipeline(pipeline, metrics, map_metric_to_runner,
                            tool_runners, suffix="tsv", prefix=None,
                            config=None, **kwargs):

    single_input_metric_functions = []
    for metric in metrics:
        metricc = map_metric_to_runner[metric.strip()]
        if metricc.name in config:
            conf = config[metricc.name]
        else:
            conf = {}
        conf = expand_generators(conf)
        configurations = build_combinations(conf)
        for configuration in configurations:
            single_input_metric_functions.append(metricc(**configuration))

    make_unique = check_unique(single_input_metric_functions,
                               input_combos=None,
                               input_regex=None,
                               input_alias=None)

    metric_runners = []
    for taskf in single_input_metric_functions:
        ignore = config.get(taskf.name, {}).get("ignore", [])
        taskf.register_input(make_unique=make_unique)
        unique_name = taskf.__name__
        # make the task name unique by adding 'prefix' as this method
        # might be called multiple times for straight, collated and
        # split tasks
        if prefix:
            taskf.__name__ = prefix + taskf.__name__

        filter_regex = ruffus.regex("(.*)/(.*).{}".format(suffix))
        result_dir = os.path.join(unique_name + ".dir")
        output = r"\1/{}/{}.tsv".format(result_dir, taskf.name)

        # Note that ignore will only work on the static parts of a
        # task, as result_dir contains a pattern that will be filled
        # in at runtime, e.g. \1/echidna_test.dir/echidna_test.tsv.
        found = False
        for i in ignore:
            if i in result_dir:
                P.get_logger().warn("the following task will be ignored: "
                                    "{} matching {}".format(result_dir, i))
                found = True
        if found:
            continue

        metric_task = pipeline.transform(task_func=taskf,
                                         input=tool_runners,
                                         filter=filter_regex,
                                         output=output,
                                         **kwargs)
        metric_runners.append(metric_task)

    f = EmptyRunner()
    if prefix:
        f.__name__ = prefix + "metrics"
    else:
        f.__name__ = "metrics"
    pipeline.merge(task_func=f,
                   input=metric_runners,
                   output=None)

    return metric_runners

def add_collations_to_pipeline(pipeline, map_tool_to_runner, collations,
                               tasks=None, config=None, **kwargs):

    runners = []
    ignore = config["setup"].get("ignore", [])
    ignore.extend(config["input"].get("ignore", []))

    for coll in collations:
        if coll not in config:
            raise KeyError(
                "configuration file requires a section for '{}'".format(
                    coll))
        coll_info = config[coll]
        for keyword in ("runner", "regex_in", "pattern_out"):
            if keyword not in coll_info:
                raise ValueError(
                    "section {} is missing required keyword '{}'".format(
                        coll, keyword))

        runner_options = config.get(coll_info["runner"], {})
        runner_name = runner_options.get("name",
                                         coll_info["runner"]).strip()
        colcc = map_tool_to_runner[runner_name]
        taskf = colcc(**runner_options)
        # automatically set alias through regex (required field)
        taskf._input_regex = coll_info.get("regex", None)
        taskf._input_alias = coll_info.get("alias", None)
        taskf._replicate_regex = coll_info.get("regex_replicate", None)
        taskf.__name__ = coll

        if tasks is not None:
            input_tasks = tasks
        elif "glob" in coll_info:
            input_tasks = coll_info["glob"]
        else:
            raise ValueError("need either tasks or glob expression "
                             "for collation")

        filter_regex = ruffus.regex(coll_info["regex_in"])
        result_dir = os.path.join(coll + ".dir")
        output_pattern = coll_info["pattern_out"]
        output_prefix = r"{}/{}".format(result_dir, output_pattern)
        output_dir = os.path.dirname(output_prefix)

        if hasattr(taskf, "output"):
            output, multiple_outputs, flexible_outputs, _suffix = \
                build_output(taskf, output_dir)
        else:
            multiple_outputs = False
            output = output_prefix

        found = False
        for i in IOTools.val2list(ignore):
            if i in result_dir:
                P.get_logger().warn("the following task will be ignored: "
                                    "{} matching {}".format(result_dir, i))
                found = True
        if found:
            continue

        metric_task = pipeline.collate(task_func=taskf,
                                       input=input_tasks,
                                       filter=filter_regex,
                                       output=output,
                                       **kwargs).mkdir(input_tasks,
                                                       filter_regex,
                                                       output_dir)

        if multiple_outputs:
            f = EmptyRunner()
            f.__name__ = taskf.__name__ + "_passthrough"
            output = [re.sub(r"\\\d+", "*", x) for x in output]
            metric_task = pipeline.split(task_func=f,
                                         input=metric_task,
                                         output=output)

        runners.append(metric_task)

    return runners

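# A hypothetical config section for a collation named "merge_vcf"; the
# three required keywords are checked above, the values are illustrative.
#
#   config = {
#       "merge_vcf": {
#           "runner": "vcf_merger",
#           "regex_in": r"(.*)/result.vcf.gz",
#           "pattern_out": "merged.vcf.gz",
#       },
#   }
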
def add_tools_to_pipeline(pipeline, map_tool_to_runner, config=None,
                          input_files=None, **kwargs):
    """add tools to a workflow pipeline.

    This function adds a task to the workflow for each input and tool
    combination.

    The configuration dictionary should contain the following
    sections:

    input:
        Configuration of input files. Key/value pairs and possibly
        hierarchical. The following keys are optional:

        regex
        alias
        group_regex
        group_alias

    tool:
        A list of tools to apply.

    A typical configuration dictionary might look like this::

        {"input": {"bam": "*.bam"},
         "tool": ["bwa_mem", "isaac"]}

    Arguments
    ---------
    pipeline : object
        The ruffus pipeline that tasks will be added to.
    map_tool_to_runner : dict
        Dictionary mapping tools to runner functions (see :ref:`tasks`).
    config : dict
        Configuration dictionary.
    input_files : list
        List of (optional) input files.
    """
    tool_functions = build_tool_functions(map_tool_to_runner, config)

    if "input" not in config:
        raise KeyError("configuration file requires an 'input' section")

    if config["input"] is None:
        raise ValueError("input section is empty")

    input_regex = config["input"].pop("regex", None)
    input_alias = config["input"].pop("alias", None)
    replicate_alias = config["input"].pop("replicate_alias", None)
    input_group_regex = config["input"].pop("group_regex", None)
    input_group_alias = config["input"].pop("group_alias", "\\1")

    ignore = config["setup"].get("ignore", [])
    ignore.extend(config["input"].get("ignore", []))

    do_replication = config["setup"].pop("replication", None)
    if do_replication:
        replications = int(do_replication)
        P.get_logger().info(
            "running experiment with {} replications".format(replications))
    else:
        replications = 1

    # update selected fields for testing purposes
    is_test = "is_test" in config
    if "test" in config["input"]:
        config["input"].update(config["input"]["test"])
        del config["input"]["test"]

    # build input/tool combinations, optionally grouping them
    config_files = expand_globs(config["input"], is_test=is_test)
    if input_group_regex:
        config_files = group_files(config_files,
                                   input_group_regex,
                                   input_group_alias)
    input_combos = build_combinations(config_files)

    tool_runners = []
    make_unique = check_unique(tool_functions,
                               input_combos=input_combos,
                               input_regex=input_regex,
                               input_alias=input_alias,
                               is_test=is_test)

    suffix = None
    for toolf, input_files in itertools.product(tool_functions,
                                                input_combos):
        for replication_idx in range(replications):
            # create a copy of the task function and give it its
            # unique name by mangling it with the input_files
            taskf = copy.copy(toolf)
            if do_replication:
                taskf.set_replication_id(replication_idx + 1)
            taskf.register_input(input_files,
                                 regex=input_regex,
                                 alias=input_alias,
                                 make_unique=make_unique,
                                 is_test=is_test,
                                 replicate_alias=replicate_alias)

            if "name" in input_files:
                # create a copy of input_files without 'name'; do not
                # modify the original as different tools require the
                # 'name' field
                input_files = dict(
                    [(x, y) for x, y in list(input_files.items())
                     if x != "name"])

            result_dir = os.path.join(taskf.__name__ + ".dir")

            found = False
            for i in IOTools.val2list(ignore):
                if i in result_dir:
                    P.get_logger().warn(
                        "the following task will be ignored: "
                        "{} matching {}".format(result_dir, i))
                    found = True
            if found:
                continue

            output, multiple_outputs, flexible_outputs, _suffix = \
                build_output(taskf, result_dir)

            if suffix is None:
                suffix = _suffix
            elif suffix != _suffix:
                raise ValueError(
                    "tools produce output files of different type, "
                    "got {}, expected {}".format(_suffix, suffix))

            tool_task = pipeline.merge(task_func=taskf,
                                       input=list(input_files.values()),
                                       output=output,
                                       **kwargs).mkdir(result_dir)

            # if there are multiple output files, split the task so
            # that each output file will be processed separately
            # further down the pipeline.
            if multiple_outputs:
                f = EmptyRunner()
                f.__name__ = taskf.__name__ + "_split"
                tool_task = pipeline.split(task_func=f,
                                           input=tool_task,
                                           output=output)

            tool_runners.append(tool_task)

    # convenience target
    f = EmptyRunner()
    f.__name__ = "tools"
    pipeline.merge(task_func=f,
                   input=tool_runners,
                   output=None)

    return suffix, tool_runners

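# A minimal sketch of wiring the add_*_to_pipeline helpers together,
# assuming a ruffus pipeline object and the runner maps defined
# elsewhere in this package. The config key "metrics" is illustrative.
#
#   pipeline = ruffus.Pipeline("benchmark")
#   suffix, tool_runners = add_tools_to_pipeline(
#       pipeline, map_tool_to_runner, config=config)
#   metric_runners = add_metrics_to_pipeline(
#       pipeline, config["metrics"], map_metric_to_runner, tool_runners,
#       suffix=suffix, config=config)
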
def save_table(table: pandas.DataFrame,
               url: str,
               tablename: str,
               schema: str = None,
               dtypes=None,
               indices=["instance_id"]):
    logger = P.get_logger()
    table.columns = sql_sanitize_columns(table.columns)

    engine = create_engine(url)

    # pandas/sqlite3 prefers the raw connection, otherwise error:
    # AttributeError: 'Engine' object has no attribute 'rollback'
    if url.startswith("sqlite"):
        _engine = engine.raw_connection()
        # In pandas >= 0.23 and using sqlite as a backend, the
        # pandas.DataFrame.to_sql command fails with "OperationalError:
        # (sqlite3.OperationalError) too many SQL variables". The
        # reason is a fixed limit in sqlite, SQLITE_MAX_VARIABLE_NUMBER,
        # which is by default set to 999.
        sql_chunk_size = 999 // (len(table.columns) + 1)
    else:
        _engine = engine
        sql_chunk_size = None

    # lower-case all table names, otherwise issues with psql
    # mixed-case access
    tablename = tablename.lower()

    create_index = False
    try:
        retry_table_to_sql(table,
                           tablename,
                           _engine,
                           schema=schema,
                           if_exists="fail",
                           index=False,
                           dtype=dtypes,
                           chunksize=sql_chunk_size)
        logger.debug(f"table {tablename} was new")
        create_index = True
    except TableExistsException:
        logger.debug(f"table {tablename} already exists - appending")

    if create_index:
        # sqlite requires an index name
        if schema:
            tablename = "{}.{}".format(schema, tablename)
        for field in indices:
            logger.debug(f"creating index on {field} for {tablename}")
            try:
                retry_sql_execute(
                    _engine,
                    str(text("CREATE INDEX {} ON {} ({})".format(
                        re.sub("[-.]", "_", tablename) + "_" + field,
                        tablename,
                        field))))
            except IndexExistsException:
                pass
            except (TypeError, sqlalchemy.exc.ProgrammingError) as ex:
                logger.warn("could not create index: {}".format(str(ex)))
    else:
        reconcile_columns(tablename, engine, table)
        retry_table_to_sql(table,
                           tablename,
                           _engine,
                           schema=schema,
                           if_exists="append",
                           index=False,
                           dtype=dtypes,
                           chunksize=sql_chunk_size)

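# Worked example of the sqlite chunk size computed above: for a table
# with 10 columns, sql_chunk_size = 999 // (10 + 1) = 90, so rows are
# inserted 90 at a time and each INSERT binds at most 90 * 10 = 900
# variables, safely below the 999 limit (the "+ 1" leaves headroom).
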