def run(self, infile, outfile, params):
    if params.reference_fasta_map is None:
        raise ValueError("bam2reference requires a reference sequence map")

    reference_fasta_map = build_reference_fasta_map(
        params.reference_fasta_map)

    fasta = resolve_argument(
        list(reference_fasta_map.values()), ",").split(",")
    retval, diff = get_reference_for_bam(infile, fasta)
    if retval is None:
        if diff is None:
            retval = "corrupted"
        else:
            retval = "unknown"
            E.debug("differences: {}".format(str(diff)))
        path = ""
    else:
        map_path2name = dict([(x[1], x[0])
                              for x in list(reference_fasta_map.items())])
        path = map_path2name.get(retval, os.path.basename(retval))

    with IOTools.open_file(outfile, "w") as outf:
        outf.write("filename\treference\tpath\n")
        outf.write("\t".join((infile, retval, path)) + "\n")

    return None
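
# The dict inversion above maps a matched fasta path back to its
# configured short name. A minimal, self-contained sketch of that step;
# the map contents here are made up for illustration:
def _example_path2name():
    reference_fasta_map = {"hg38": "/data/hg38.fa", "hg19": "/data/hg19.fa"}
    map_path2name = {path: name for name, path in reference_fasta_map.items()}
    assert map_path2name.get("/data/hg38.fa") == "hg38"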
def run(self, infiles, outfile, params):
    if not outfile.endswith("-pass.fastq.gz"):
        raise ValueError(
            "outfile must end in -pass.fastq.gz, got {}".format(outfile))

    if params.min_size_bytes:
        before = len(infiles)
        infiles = [x for x in infiles
                   if os.path.getsize(x) >= params.min_size_bytes]
        E.debug(
            "removing small files: after={}, before={}, removed={}".format(
                len(infiles), before, before - len(infiles)))

    if params.newer_than:
        before = len(infiles)
        cutoff = os.path.getmtime(params.newer_than)
        infiles = [x for x in infiles if os.path.getmtime(x) > cutoff]
        E.debug(
            "removing old files: after={}, before={}, removed={}".format(
                len(infiles), before, before - len(infiles)))

    if len(infiles) == 0:
        E.warn("no files left after filtering, creating empty file")
        IOTools.touch_file(outfile)
        return

    infiles = " ".join(infiles)

    outfile_fail = IOTools.snip(outfile, "-pass.fastq.gz") + "-fail.fastq.gz"

    statement = ("zcat {infiles} "
                 "| daisy fastq2fastq "
                 "--method=filter-ONT "
                 "--min-average-quality={params.min_average_quality} "
                 "--log={outfile}.log "
                 "--min-length={params.min_length} "
                 "--output-removed-fastq={outfile_fail} "
                 "- "
                 "| gzip "
                 "> {outfile}".format(**locals()))

    return P.run(statement)
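
# The statement above relies on str.format's attribute lookup:
# "{params.min_length}".format(**locals()) resolves the local name
# "params" and then its attribute. A minimal demonstration with a
# hypothetical params object (attribute values made up):
def _example_format_locals():
    class _Params:
        min_length = 1000
        min_average_quality = 7

    params = _Params()
    statement = ("--min-length={params.min_length} "
                 "--min-average-quality={params.min_average_quality}"
                 ).format(**locals())
    assert statement == "--min-length=1000 --min-average-quality=7"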
def main(argv=None):
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    total_counter = E.Counter()
    table = []

    for section, map_task2runner in [("tool", map_tool_to_runner),
                                     ("metric", map_metric_to_runner),
                                     ("split", map_split_to_runner),
                                     ("collate", map_collate_to_runner)]:
        E.debug("processing section: {}".format(section))
        counter = E.Counter()
        for task, taskf in sorted(map_task2runner.items()):
            counter.ntasks += 1
            comments = []
            try:
                version = taskf().get_version()
                counter.version_ok += 1
            except Exception:
                version = ""
                comments.append("unavailable")
                counter.version_fail += 1
            comments = "; ".join(comments)
            table.append((section, task, version, comments))
        E.info("{}: {}".format(section, counter))
        total_counter += counter

    options.stdout.write("section\ttask\tversion\tcomments\n")
    for row in table:
        options.stdout.write("\t".join(map(str, row)) + "\n")

    # report the accumulated total, not the last section's counter
    E.info("{}: {}".format("total", total_counter))
    E.stop()
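
# A minimal sketch of the version probe above against a hypothetical
# task map: a runner whose get_version() raises is reported with an
# empty version and the comment "unavailable", mirroring the
# try/except block:
def _example_probe_versions(map_task2runner):
    rows = []
    for task, taskf in sorted(map_task2runner.items()):
        try:
            version, comments = taskf().get_version(), ""
        except Exception:
            version, comments = "", "unavailable"
        rows.append((task, version, comments))
    return rows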
def match_sequence_dictionaries(sequence_dict, fastafiles):
    """match a sequence dictionary (contig, length) against a collection
    of fasta files.

    :param sequence_dict: dictionary of contig/length pairs.
    :param fastafiles: list of :term:`fasta` formatted files. The fasta
        files need to be indexed with samtools faidx.

    :return: a tuple (fastafn, diffs). Fastafn is the filename of the
        fasta file that has been matched, None if no match has been
        found. Diffs contains a list of discrepancies between
        sequence_dict and files in fastafiles that have been examined.

    If sequence_dict is empty, fastafn will be None and the list of
    diffs empty.
    """
    if not sequence_dict:
        return None, []

    fastafn = None
    diffs = []
    # match by sequence dictionary with optional length
    for fastafn, fastadict in sequence_length_dicts_iterator(fastafiles):
        E.debug("inspected {}".format(fastafn))
        contig_missing = None
        length_mismatch = None
        for reference, length in sequence_dict.items():
            if reference not in fastadict:
                contig_missing = reference
                break
            if length > 0 and length != fastadict[reference]:
                length_mismatch = reference
                break
        if not (length_mismatch or contig_missing):
            break
        else:
            diffs.append((fastafn, contig_missing, length_mismatch))
    else:
        # no file matched: honour the documented None return instead of
        # leaking the last file inspected
        fastafn = None

    return fastafn, diffs
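
# Usage sketch for the matcher above; the fasta paths are hypothetical
# and must already be indexed with "samtools faidx":
def _example_match():
    sequence_dict = {"chr1": 248956422, "chr2": 242193529}
    fastafn, diffs = match_sequence_dictionaries(
        sequence_dict, ["/data/hg38.fa", "/data/hg19.fa"])
    if fastafn is None:
        # each entry is (fastafn, contig_missing, length_mismatch)
        for fn, missing, mismatch in diffs:
            print(fn, missing, mismatch)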
def main(argv):
    options, args = P.parse_commandline(argv)

    if options.config_file:
        PARAMS = P.get_parameters(options.config_file)
    else:
        sys.exit(P.main(options, args))

    with arvados_enabled(always_mount=options.always_mount):
        mountpoint = PARAMS.get("mount_point", None)
        if mountpoint:
            redirect_defaults2mountpoint(mountpoint)

        with LibraryContext(PARAMS, options, args, argv, "daisy"):
            # A selection of command line arguments are added to PARAMS
            # as 'extras' (not implemented in ruffus 2.6.3)
            kwargs = collections.defaultdict(dict)
            if options.only_info:
                kwargs["extras"].update({'only_info': True})
                P.PARAMS["only_info"] = True
            if options.is_test:
                kwargs["extras"].update({'is_test': True})
                P.PARAMS["is_test"] = True

            E.debug("construction of workflow started")

            pipeline = ruffus.Pipeline('benchmark')

            # Tool execution
            suffix, tool_runners = add_tools_to_pipeline(
                pipeline,
                map_tool_to_runner,
                config=P.PARAMS,
                **kwargs)

            E.debug("added tools to workflow")

            # Optionally, add externally computed files as
            # pseudo-tools:
            if "external" in P.PARAMS["setup"]:
                external_runners = add_external_data_to_pipeline(
                    pipeline, config=P.PARAMS, **kwargs)
                tool_runners.extend(external_runners)

            # Optionally, combine tool runs into aggregate
            # outputs. The type of the output is preserved
            # (VCF -> VCF, etc.)
            # For example, call individual members in a trio
            # and then build a combined VCF to analyse mendelian
            # inconsistencies.
            if "collate" in P.PARAMS["setup"]:
                collate_runners = add_collations_to_pipeline(
                    pipeline,
                    map_collate_to_runner,
                    P.PARAMS["setup"]["collate"],
                    tasks=tool_runners,
                    config=P.PARAMS)
                if P.PARAMS["setup"].get("only_collate", False):
                    tool_runners = []
                if P.PARAMS["setup"].get("no_collate_metrics", False):
                    collate_runners = []
                E.debug("added collators to workflow")
            else:
                collate_runners = []

            # Optionally, split up the output before applying
            # additional analyses. The type of the output is preserved
            # (VCF -> VCF, etc).
            # For example, identify false positives, false negatives
            # and true positives and collect metrics individually.
if "split" in P.PARAMS["setup"]: split_runners = add_splits_to_pipeline( pipeline, map_split_to_runner, tool_runners, P.PARAMS["setup"]["split"], tasks=tool_runners, config=P.PARAMS) if P.PARAMS["setup"].get("only_split", False): tool_runners = [] E.debug("added splitters to workflow ") else: split_runners = [] metric_runners = [] for prefix, r in zip( ["tool", "collate", "split"], [tool_runners, collate_runners, split_runners]): if not r: continue metrics = None if prefix == "collate" and "collate_metrics" in P.PARAMS[ "setup"]: metrics = P.PARAMS["setup"]["collate_metrics"] elif prefix == "split" and "split_metrics" in P.PARAMS["setup"]: metrics = P.PARAMS["setup"]["split_metrics"] elif "metrics" in P.PARAMS["setup"]: metrics = P.PARAMS["setup"]["metrics"] else: raise KeyError( "configuration file requires a 'setup:metrics' section" ) # Metric execution mm = add_metrics_to_pipeline(pipeline, metrics, map_metric_to_runner, r, suffix=suffix, prefix=prefix + "_", config=P.PARAMS, **kwargs) if len(mm) == 0: raise ValueError( "workflow construction error: " "no metric tasks result for metrics {}".format( metrics)) metric_runners.extend(mm) E.debug("added {}_metrics to workflow".format(prefix)) # add plot task if "aggregate" in P.PARAMS["setup"]: aggregate_metrics = add_collations_to_pipeline( pipeline, map_collate_to_runner, P.PARAMS["setup"]["aggregate"], metric_runners, config=P.PARAMS) E.debug("added metric aggregation to workflow") else: aggregate_metrics = [] add_upload_to_pipeline(pipeline, metric_runners + aggregate_metrics, P.PARAMS) E.debug("added upload to workflow".format(prefix)) # add export task export = P.PARAMS["setup"].get("export", ["tools", "collate", "split"]) map_export2runner = { "collate": collate_runners, "tools": tool_runners, "split": split_runners } export_runners = [] for e in export: try: export_runners.extend(map_export2runner[e]) except KeyError: raise KeyError("unknown export section: {}".format(e)) add_export_to_pipeline(pipeline, export_runners, suffix=suffix, config=P.PARAMS) E.debug("added export to workflow") add_all_task_to_pipeline(pipeline, metric_runners + aggregate_metrics) # Collate output files to facilitate analysis if "collation" in P.PARAMS: collators = add_collations_to_pipeline(pipeline, map_collate_to_runner, P.PARAMS["collation"], config=P.PARAMS) E.debug("construction of workflow completed") E.stop()
def purge_run_id(run_id, url, dry_run=False, schemas=None):
    """remove a run from a database.
    """
    engine = sqlalchemy.create_engine(url)
    connection = engine.connect()

    # automap
    metadata = sqlalchemy.MetaData()
    metadata.reflect(engine)
    base = automap_base(metadata=metadata)
    base.prepare()

    if schemas is None:
        insp = reflection.Inspector.from_engine(engine)
        schemas = insp.get_schema_names()
        # note: default sqlite schema is "main"
        if 'public' in schemas:
            schemas.remove('public')
        if 'information_schema' in schemas:
            schemas.remove('information_schema')

    E.debug("getting instance_id list of run_id={}".format(run_id))
    instance_ids = set(get_instance_ids_for_run_id(run_id, engine))
    E.debug("found {} instances for run_id={}".format(len(instance_ids),
                                                      run_id))

    non_metric_tables = ['run',
                         'arvados_job',
                         'instance',
                         'binary_data',
                         'metric_timings',
                         'tool_timings',
                         'metric_storage',
                         'tags']

    # delete from tables with field "instance_id"
    if instance_ids:
        for schema in schemas:
            # automap the schema
            metadata_schema = sqlalchemy.MetaData()
            metadata_schema.reflect(engine, schema=schema)
            base_schema = automap_base(metadata=metadata_schema)
            base_schema.prepare()
            for table_name in list(base_schema.metadata.tables.keys()):
                table = sqlalchemy.Table(table_name,
                                         metadata_schema,
                                         autoload=True)
                if "instance_id" not in table.c:
                    continue
                E.info("deleting data in {}".format(table_name))
                delete = table.delete().where(
                    table.c.instance_id.in_(instance_ids))
                # E.debug(delete)
                if not dry_run:
                    connection.execute(delete)

    # delete from tables with field "run_id"
    for table_name in base.metadata.tables.keys():
        table = sqlalchemy.Table(table_name, metadata, autoload=True)
        if "run_id" not in table.c:
            continue
        E.info("deleting data in {} for run_id {}".format(table_name,
                                                          run_id))
        delete = table.delete().where(table.c.run_id == run_id)
        # E.debug(delete)
        if not dry_run:
            connection.execute(delete)

    table = sqlalchemy.Table('run', metadata, autoload=True)
    delete = table.delete().where(table.c.id == run_id)
    E.info("deleting data in 'run' for id {}".format(run_id))
    # E.debug(delete)
    if not dry_run:
        connection.execute(delete)
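
# Usage sketch: dry-run a purge against a hypothetical sqlite URL to
# log which tables would be touched before deleting anything:
def _example_purge():
    purge_run_id(42, "sqlite:///./csvdb", dry_run=True)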