def fetch_pipelines(self, protocol):
    """ Fetch the mapping for a particular protocol, null if unmapped.

    :param str protocol: name/key for the protocol for which to fetch the
        pipeline(s)
    :return str | Iterable[str] | NoneType: pipeline(s) to which the given
        protocol is mapped, otherwise null
    """
    # Normalize the protocol name the same way the mapping's keys were
    # normalized, then do a null-safe lookup.
    return self.protocol_mapping.get(utils.alpha_cased(protocol))
def standardize_protocols(piface):
    """ Handle casing and punctuation of protocol keys in pipeline interface.

    :param Mapping piface: Pipeline interface data to standardize.
    :return Mapping: Same as the input, but with protocol keys case and
        punctuation handled in a more uniform way for matching later.
    """
    assert PROTOMAP_KEY in piface, "For protocol mapping standardization, " \
        "pipeline interface data must contain key '{}'".format(PROTOMAP_KEY)
    # Rebuild the protocol mapping with normalized (case/punctuation-folded)
    # protocol names as keys, leaving the mapped pipeline keys untouched.
    normalized = {}
    for raw_protocol, pipeline_key in piface[PROTOMAP_KEY].items():
        normalized[utils.alpha_cased(raw_protocol)] = pipeline_key
    piface[PROTOMAP_KEY] = normalized
    return piface
def process_pipeline_interfaces(pipeline_interface_locations):
    """ Create a PipelineInterface for each pipeline location given.

    :param Iterable[str] pipeline_interface_locations: locations, each of
        which should be either a directory path or a filepath, that specifies
        pipeline interface and protocol mappings information. Each such file
        should have a pipelines section and a protocol mappings section.
    :return Mapping[str, Iterable[PipelineInterface]]: mapping from protocol
        name to interface(s) for which that protocol is mapped
    """
    interface_by_protocol = defaultdict(list)
    for pipe_iface_location in pipeline_interface_locations:
        # Silently-missing locations would be confusing downstream; warn and
        # skip rather than letting PipelineInterface fail on a bad path.
        if not os.path.exists(pipe_iface_location):
            _LOGGER.warning(
                "Ignoring nonexistent pipeline interface "
                "location: '%s'", pipe_iface_location)
            continue
        pipe_iface = PipelineInterface(pipe_iface_location)
        # Register this interface under each protocol it maps, keyed by the
        # normalized protocol name so later lookups match regardless of case.
        for proto_name in pipe_iface.protomap:
            _LOGGER.log(5, "Adding protocol name: '%s'", proto_name)
            interface_by_protocol[alpha_cased(proto_name)].append(pipe_iface)
    return interface_by_protocol
def build_submission_bundles(self, protocol, priority=True):
    """ Create pipelines to submit for each sample of a particular protocol.

    With the argument (flag) to the priority parameter, there's control
    over whether to submit pipeline(s) from only one of the project's
    known pipeline locations with a match for the protocol, or whether to
    submit pipelines created from all locations with a match for the
    protocol.

    :param str protocol: name of the protocol/library for which to
        create pipeline(s)
    :param bool priority: to only submit pipeline(s) from the first of the
        pipelines location(s) (indicated in the project config file) that
        has a match for the given protocol; optional, default True
    :return Iterable[(PipelineInterface, type, str, str)]:
    :raises AssertionError: if there's a failure in the attempt to
        partition an interface's pipeline scripts into disjoint subsets of
        those already mapped and those not yet mapped
    """
    protocol = alpha_cased(protocol)

    if not priority:
        raise NotImplementedError(
            "Currently, only prioritized protocol mapping is supported "
            "(i.e., pipeline interfaces collection is a prioritized list, "
            "so only the first interface with a protocol match is used.)")

    # Pull out the collection of interfaces (potentially one from each of
    # the locations indicated in the project configuration file) as a
    # sort of pool of information about possible ways in which to submit
    # pipeline(s) for sample(s) of the indicated protocol.
    try:
        pipeline_interfaces = \
            self.interfaces_by_protocol[protocol]
    except KeyError:
        # Messaging can be done by the caller.
        _LOGGER.debug("No interface for protocol: %s", protocol)
        return []

    job_submission_bundles = []
    pipeline_keys_used = set()
    _LOGGER.debug("Building pipelines for {} interface(s)...".format(
        len(pipeline_interfaces)))

    bundle_by_strict_pipe_key = {}

    for pipe_iface in pipeline_interfaces:
        # "Break"-like mechanism for short-circuiting if we care only
        # about the highest-priority match for pipeline submission.
        # That is, if the intent is to submit pipeline(s) from a single
        # location for each sample of the given protocol, we can stop
        # searching the pool of pipeline interface information once we've
        # found a match for the protocol.
        if priority and len(job_submission_bundles) > 0:
            return job_submission_bundles[0]

        this_protocol_pipelines = pipe_iface.fetch_pipelines(protocol)
        if not this_protocol_pipelines:
            _LOGGER.debug("No pipelines; available: {}".format(", ".join(
                pipe_iface.protomap.keys())))
            continue

        # TODO: update once dependency-encoding logic is in place.
        # The proposed dependency-encoding format uses a semicolon
        # between pipelines for which the dependency relationship is
        # serial. For now, simply treat those as multiple independent
        # pipelines by replacing the semicolon with a comma, which is the
        # way in which multiple independent pipelines for a single protocol
        # are represented in the mapping declaration.
        pipeline_keys = \
            this_protocol_pipelines.replace(";", ",") \
                .strip(" ()\n") \
                .split(",")
        # These cleaned pipeline keys are what's used to resolve the path
        # to the pipeline to run.
        pipeline_keys = [pk.strip() for pk in pipeline_keys]

        # Skip over pipelines already mapped by another location.
        already_mapped, new_scripts = \
            partition(pipeline_keys,
                      partial(_is_member, items=pipeline_keys_used))
        pipeline_keys_used |= set(pipeline_keys)

        # Attempt to validate that partition yielded disjoint subsets.
        try:
            disjoint_partition_violation = \
                set(already_mapped) & set(new_scripts)
        except TypeError:
            _LOGGER.debug("Unable to hash partitions for validation")
        else:
            assert not disjoint_partition_violation, \
                "Partitioning {} with membership in {} as " \
                "predicate produced intersection: {}".format(
                    pipeline_keys, pipeline_keys_used,
                    disjoint_partition_violation)

        if len(already_mapped) > 0:
            _LOGGER.debug(
                "Skipping {} already-mapped script name(s): {}".format(
                    len(already_mapped), already_mapped))
        _LOGGER.debug("{} new scripts for protocol {} from "
                      "pipeline(s) location '{}': {}".format(
                          len(new_scripts), protocol,
                          pipe_iface.source, new_scripts))

        # For each pipeline script to which this protocol will pertain,
        # create the new jobs/submission bundles.
        new_jobs = []
        for pipeline_key in new_scripts:
            # Determine how to reference the pipeline and where it is.
            strict_pipe_key, full_pipe_path, full_pipe_path_with_flags = \
                pipe_iface.finalize_pipeline_key_and_paths(
                    pipeline_key)

            # Skip and warn about nonexistent alleged pipeline path.
            if not (os.path.exists(full_pipe_path) or
                    is_command_callable(full_pipe_path)):
                # BUGFIX: was _LOGGER.warninging, which is not a logger
                # method and raised AttributeError instead of warning.
                _LOGGER.warning("Missing pipeline script: '%s'",
                                full_pipe_path)
                continue

            # Determine which interface and Sample subtype to use.
            sample_subtype = \
                pipe_iface.fetch_sample_subtype(
                    protocol, strict_pipe_key, full_pipe_path)

            # Package the pipeline's interface, subtype, command, and key.
            submission_bundle = SubmissionBundle(
                pipe_iface, sample_subtype, strict_pipe_key,
                full_pipe_path_with_flags)

            # Enforce bundle uniqueness for each strict pipeline key.
            maybe_new_bundle = (full_pipe_path_with_flags,
                                sample_subtype, pipe_iface)
            old_bundle = bundle_by_strict_pipe_key.setdefault(
                strict_pipe_key, maybe_new_bundle)
            if old_bundle != maybe_new_bundle:
                errmsg = "Strict pipeline key '{}' maps to more than " \
                    "one combination of pipeline script + flags, " \
                    "sample subtype, and pipeline interface. " \
                    "'{}'\n{}".format(
                        strict_pipe_key, maybe_new_bundle, old_bundle)
                raise ValueError(errmsg)

            # Add this bundle to the collection of ones relevant for the
            # current PipelineInterface.
            new_jobs.append(submission_bundle)

        job_submission_bundles.append(new_jobs)

    # Repeat logic check of short-circuit conditional to account for
    # edge case in which it's satisfied during the final iteration.
    if priority and len(job_submission_bundles) > 1:
        return job_submission_bundles[0]
    else:
        return list(itertools.chain(*job_submission_bundles))
def __call__(self):
    """ Do the summarization.

    Aggregates per-sample stats and objects files into project-level
    spreadsheets, runs any protocol-specific summarizer scripts declared
    in the pipeline interfaces, and builds the HTML report.
    """
    import csv
    columns = []
    stats = []
    objs = _pd.DataFrame()

    # First, the generic summarize will pull together all the fits
    # and stats from each sample into project-combined spreadsheets.
    # Create stats_summary file
    for sample in self.prj.samples:
        _LOGGER.info(self.counter.show(sample.sample_name,
                                       sample.protocol))
        sample_output_folder = sample_folder(self.prj, sample)

        # Grab the basic info from the annotation sheet for this sample.
        # This will correspond to a row in the output.
        sample_stats = sample.get_sheet_dict()
        columns.extend(sample_stats.keys())
        # Version 0.3 standardized all stats into a single file
        stats_file = os.path.join(sample_output_folder, "stats.tsv")
        if os.path.isfile(stats_file):
            _LOGGER.info("Using stats file: '%s'", stats_file)
        else:
            _LOGGER.warning("No stats file '%s'", stats_file)
            continue

        t = _pd.read_table(
            stats_file, header=None, names=['key', 'value', 'pl'])
        t.drop_duplicates(subset=['key', 'pl'], keep='last', inplace=True)
        # t.duplicated(subset= ['key'], keep = False)
        # Disambiguate keys reported by more than one pipeline by
        # prefixing them with the pipeline name ("pl:key").
        t.loc[:, 'plkey'] = t['pl'] + ":" + t['key']
        dupes = t.duplicated(subset=['key'], keep=False)
        t.loc[dupes, 'key'] = t.loc[dupes, 'plkey']
        sample_stats.update(t.set_index('key')['value'].to_dict())
        stats.append(sample_stats)
        columns.extend(t.key.tolist())

    self.counter.reset()

    # Create objects summary file
    for sample in self.prj.samples:
        # Process any reported objects
        _LOGGER.info(self.counter.show(sample.sample_name,
                                       sample.protocol))
        sample_output_folder = sample_folder(self.prj, sample)
        objs_file = os.path.join(sample_output_folder, "objects.tsv")
        if os.path.isfile(objs_file):
            _LOGGER.info("Using objects file: '%s'", objs_file)
        else:
            _LOGGER.warning("No objects file '%s'", objs_file)
            continue
        t = _pd.read_table(objs_file, header=None,
                           names=['key', 'filename', 'anchor_text',
                                  'anchor_image', 'annotation'])
        # NOTE(review): uses sample.name here but sample.sample_name in
        # the log calls above — presumably aliases; confirm on Sample.
        t['sample_name'] = sample.name
        # NOTE: DataFrame.append is deprecated in modern pandas; kept for
        # compatibility with the pandas version this project pins.
        objs = objs.append(t, ignore_index=True)

    tsv_outfile_path = os.path.join(self.prj.metadata.output_dir,
                                    self.prj.name)
    if hasattr(self.prj, "subproject") and self.prj.subproject:
        tsv_outfile_path += '_' + self.prj.subproject
    tsv_outfile_path += '_stats_summary.tsv'
    # BUGFIX: use a context manager so the summary file is closed even if
    # a write fails (previously a bare open/close pair leaked the handle
    # on exception).
    with open(tsv_outfile_path, 'w') as tsv_outfile:
        tsv_writer = csv.DictWriter(tsv_outfile,
                                    fieldnames=uniqify(columns),
                                    delimiter='\t', extrasaction='ignore')
        tsv_writer.writeheader()
        for row in stats:
            tsv_writer.writerow(row)
    _LOGGER.info(
        "Summary (n=" + str(len(stats)) + "): " + tsv_outfile_path)

    # Next, looper can run custom summarizers, if they exist.
    all_protocols = [sample.protocol for sample in self.prj.samples]
    _LOGGER.debug("Protocols: " + str(all_protocols))
    _LOGGER.debug(self.prj.interfaces_by_protocol)
    for protocol in set(all_protocols):
        try:
            ifaces = self.prj.interfaces_by_protocol[alpha_cased(protocol)]
        except KeyError:
            _LOGGER.warning("No interface for protocol '{}', skipping summary".
                            format(protocol))
            continue
        for iface in ifaces:
            _LOGGER.debug(iface)
            pl = iface.fetch_pipelines(protocol)
            summarizers = iface.get_attribute(pl, "summarizers")
            if summarizers is not None:
                for summarizer in set(summarizers):
                    # Summarizer paths are relative to the interface file.
                    summarizer_abspath = os.path.join(
                        os.path.dirname(iface.pipe_iface_file), summarizer)
                    _LOGGER.debug([summarizer_abspath, self.prj.config_file])
                    try:
                        subprocess.call([summarizer_abspath,
                                         self.prj.config_file])
                    except OSError:
                        _LOGGER.warning("Summarizer was unable to run: "
                                        + str(summarizer))

    # Produce HTML report
    report_builder = HTMLReportBuilder(self.prj)
    report_path = report_builder(objs, stats, uniqify(columns))
    _LOGGER.info(
        "HTML Report (n=" + str(len(stats)) + "): " + report_path)
def __call__(self, args, remaining_args):
    """ Do the Sample submission.

    :param argparse.Namespace args: parsed command-line options and
        arguments, recognized by looper
    :param list remaining_args: command-line options and arguments not
        recognized by looper, germane to samples/pipelines
    """
    protocols = {s.protocol for s in self.prj.samples
                 if hasattr(s, "protocol")}
    failures = defaultdict(list)  # Collect problems by sample.
    processed_samples = set()  # Enforce one-time processing.

    _LOGGER.info("Finding pipelines for protocol(s): {}".
                 format(", ".join(self.prj.protocols)))

    # Job submissions are managed on a per-pipeline basis so that
    # individual commands (samples) may be lumped into a single job.
    submission_conductors = {}
    pipe_keys_by_protocol = defaultdict(list)
    mapped_protos = set()
    for proto in protocols | {GENERIC_PROTOCOL_KEY}:
        proto_key = alpha_cased(proto)
        _LOGGER.debug("Determining sample type, script, and flags for "
                      "pipeline(s) associated with protocol: %s", proto)
        submission_bundles = self.prj.build_submission_bundles(proto_key)
        if not submission_bundles:
            if proto_key != GENERIC_PROTOCOL_KEY:
                _LOGGER.warning("No mapping for protocol: '%s'", proto)
            continue
        mapped_protos.add(proto)
        for pl_iface, sample_subtype, pl_key, script_with_flags in \
                submission_bundles:
            _LOGGER.debug("%s: %s", pl_key, sample_subtype.__name__)
            conductor = SubmissionConductor(
                pl_key, pl_iface, script_with_flags, self.prj,
                args.dry_run, args.time_delay, sample_subtype,
                remaining_args, args.ignore_flags,
                self.prj.compute, max_cmds=args.lumpn, max_size=args.lump)
            submission_conductors[pl_key] = conductor
            pipe_keys_by_protocol[proto_key].append(pl_key)

    # Determine number of samples eligible for processing.
    num_samples = len(self.prj.samples)
    if args.limit is None:
        upper_sample_bound = num_samples
    elif args.limit < 0:
        raise ValueError(
            "Invalid number of samples to run: {}".format(args.limit))
    else:
        upper_sample_bound = min(args.limit, num_samples)
    _LOGGER.debug("Limiting to %d of %d samples",
                  upper_sample_bound, num_samples)

    num_commands_possible = 0
    failed_submission_scripts = []

    for sample in self.prj.samples[:upper_sample_bound]:
        # First, step through the samples and determine whether any
        # should be skipped entirely, based on sample attributes alone
        # and independent of anything about any of its pipelines.

        # Start by displaying the sample index and a fresh collection
        # of sample-skipping reasons.
        _LOGGER.info(self.counter.show(
            sample.sample_name, sample.protocol))
        skip_reasons = []

        # Don't submit samples with duplicate names unless suppressed.
        if sample.sample_name in processed_samples:
            if args.allow_duplicate_names:
                _LOGGER.warning("Duplicate name detected, but submitting anyway")
            else:
                skip_reasons.append("Duplicate sample name")

        # Check if sample should be run.
        if sample.is_dormant():
            skip_reasons.append(
                "Inactive status (via '{}' column/attribute)".
                format(SAMPLE_EXECUTION_TOGGLE))

        # Get the base protocol-to-pipeline mappings.
        try:
            protocol = sample.protocol
        except AttributeError:
            skip_reasons.append("Sample has no protocol")
        else:
            if protocol not in mapped_protos and \
                    GENERIC_PROTOCOL_KEY not in mapped_protos:
                skip_reasons.append("No pipeline for protocol")

        if skip_reasons:
            _LOGGER.warning(
                "> Not submitted: {}".format(", ".join(skip_reasons)))
            failures[sample.name] = skip_reasons
            continue

        # Processing preconditions have been met.
        # Add this sample to the processed collection.
        processed_samples.add(sample.sample_name)

        # At this point, we have a generic Sample; write that to disk
        # for reuse in case of many jobs (pipelines) using base Sample.
        # Do a single overwrite here, then any subsequent Sample can be sure
        # that the file is fresh, with respect to this run of looper.
        sample.to_yaml(subs_folder_path=self.prj.metadata.submission_subdir)

        pipe_keys = pipe_keys_by_protocol.get(alpha_cased(sample.protocol)) \
            or pipe_keys_by_protocol.get(GENERIC_PROTOCOL_KEY)
        _LOGGER.debug("Considering %d pipeline(s)", len(pipe_keys))

        pl_fails = []
        for pl_key in pipe_keys:
            num_commands_possible += 1
            # TODO: of interest to track failures by pipeline?
            conductor = submission_conductors[pl_key]
            # TODO: check return value from add() to determine whether
            # TODO (cont.) to grow the failures list.
            try:
                curr_pl_fails = conductor.add_sample(sample)
            except JobSubmissionException as e:
                failed_submission_scripts.append(e.script)
            else:
                pl_fails.extend(curr_pl_fails)
        if pl_fails:
            failures[sample.name].extend(pl_fails)

    job_sub_total = 0
    cmd_sub_total = 0
    for conductor in submission_conductors.values():
        conductor.submit(force=True)
        job_sub_total += conductor.num_job_submissions
        cmd_sub_total += conductor.num_cmd_submissions

    # Report what went down.
    # NOTE(review): args.limit == 0 falls through to infinity here
    # ("or" treats 0 as falsy) — presumably limit is None or positive;
    # confirm against the CLI parser.
    max_samples = min(len(self.prj.samples), args.limit or float("inf"))
    _LOGGER.info("\nLooper finished")
    _LOGGER.info("Samples valid for job generation: %d of %d",
                 len(processed_samples), max_samples)
    _LOGGER.info("Successful samples: %d of %d",
                 max_samples - len(failures), max_samples)
    _LOGGER.info("Commands submitted: %d of %d",
                 cmd_sub_total, num_commands_possible)
    _LOGGER.info("Jobs submitted: %d", job_sub_total)
    if args.dry_run:
        _LOGGER.info("Dry run. No jobs were actually submitted.")

    # Restructure sample/failure data for display.
    samples_by_reason = defaultdict(set)
    # Collect names of failed sample(s) by failure reason.
    # BUGFIX(maintainability): previously the loop targets were named
    # "sample" and "failures", rebinding the failures dict while it was
    # being iterated; renamed to avoid the shadowing trap.
    for failed_sample, fail_reasons in failures.items():
        for f in fail_reasons:
            samples_by_reason[f].add(failed_sample)

    # Collect samples by pipeline with submission failure.
    failed_samples_by_pipeline = defaultdict(set)
    for pl_key, conductor in submission_conductors.items():
        # Don't add failure key if there are no samples that failed for
        # that reason.
        if conductor.failed_samples:
            fails = set(conductor.failed_samples)
            samples_by_reason[SUBMISSION_FAILURE_MESSAGE] |= fails
            failed_samples_by_pipeline[pl_key] |= fails

    failed_sub_samples = samples_by_reason.get(SUBMISSION_FAILURE_MESSAGE)
    if failed_sub_samples:
        _LOGGER.info("\n{} samples with at least one failed job submission: {}".
                     format(len(failed_sub_samples),
                            ", ".join(failed_sub_samples)))

    # If failure keys are only added when there's at least one sample that
    # failed for that reason, we can display information conditionally,
    # depending on whether there's actually failure(s).
    if samples_by_reason:
        _LOGGER.info("\n{} unique reasons for submission failure: {}".format(
            len(samples_by_reason), ", ".join(samples_by_reason.keys())))
        full_fail_msgs = [create_failure_message(reason, samples)
                          for reason, samples in samples_by_reason.items()]
        _LOGGER.info("\nSummary of failures:\n{}".
                     format("\n".join(full_fail_msgs)))
def fetch_sample_subtype(
        self, protocol, strict_pipe_key, full_pipe_path):
    """ Determine the interface and Sample subtype for a protocol and pipeline.

    :param str protocol: name of the relevant protocol
    :param str strict_pipe_key: key for specific pipeline in a pipeline
        interface mapping declaration; this must exactly match a key in
        the PipelineInterface (or the Mapping that represent it)
    :param str full_pipe_path: (absolute, expanded) path to the
        pipeline script
    :return type: Sample subtype to use for jobs for the given protocol,
        that use the pipeline indicated
    :raises KeyError: if given a pipeline key that's not mapped in the
        pipelines section of this PipelineInterface
    """
    subtype = None

    this_pipeline_data = self.pipelines[strict_pipe_key]

    try:
        subtypes = this_pipeline_data[SUBTYPE_MAPPING_SECTION]
    except KeyError:
        _LOGGER.debug("Configuration (from %s) doesn't define section '%s' "
                      "for pipeline '%s'",
                      self.source, SUBTYPE_MAPPING_SECTION, strict_pipe_key)
        # Without a subtypes section, if pipeline module defines a single
        # Sample subtype, we'll assume that type is to be used when in
        # this case, when the interface section for this pipeline lacks
        # an explicit subtypes section specification.
        subtype_name = None
    else:
        if subtypes is None:
            # Designate lack of need for import attempt and provide
            # class with name to format message below.
            subtype = Sample
            _LOGGER.debug("Null %s subtype(s) section specified for "
                          "pipeline: '%s'; using base %s type",
                          subtype.__name__, strict_pipe_key,
                          subtype.__name__)
        elif isinstance(subtypes, str):
            subtype_name = subtypes
            # BUGFIX: log arguments were previously passed in the order
            # (subtype_name, strict_pipe_key, self.source), which did not
            # match the message's placeholders (pipeline, source, name).
            _LOGGER.debug("Single subtype name for pipeline '%s' "
                          "in interface from '%s': '%s'",
                          strict_pipe_key, self.source, subtype_name)
        else:
            # Mapping of protocol name to subtype name; normalize the
            # protocol keys the same way as the lookup key.
            temp_subtypes = {
                utils.alpha_cased(p): st for p, st in subtypes.items()}
            try:
                subtype_name = temp_subtypes[utils.alpha_cased(protocol)]
            except KeyError:
                # Designate lack of need for import attempt and provide
                # class with name to format message below.
                subtype = Sample
                _LOGGER.debug("No %s subtype specified in interface from "
                              "'%s': '%s', '%s'; known: %s",
                              subtype.__name__, self.source,
                              strict_pipe_key, protocol,
                              ", ".join(temp_subtypes.keys()))

    # subtype_name is defined if and only if subtype remained null.
    # The import helper function can return null if the import attempt
    # fails, so provide the base Sample type as a fallback.
    subtype = subtype or \
        _import_sample_subtype(full_pipe_path, subtype_name) or \
        Sample
    _LOGGER.debug("Using Sample subtype: %s", subtype.__name__)
    return subtype