@property
def segmentlistdict(self):
    """
    A segmentlistdict object describing the instruments and time
    spanned by this CacheEntry.  A new object is constructed each time
    this attribute is accessed (segments are immutable so there is no
    reason to try to share a reference to the CacheEntry's internal
    segment; modifications of one would not be reflected in the other
    anyway).

    Example:

    >>> c = CacheEntry(u"H1 S5 815901601 576.5 file://localhost/home/kipp/tmp/1/H1-815901601-576.xml")
    >>> c.segmentlistdict
    {u'H1': [segment(LIGOTimeGPS(815901601, 0), LIGOTimeGPS(815902177, 500000000))]}

    The "observatory" column of the cache entry, which is frequently
    used to store instrument names, is parsed into instrument names for
    the dictionary keys using the same rules as
    pycbc_glue.ligolw.lsctables.instrument_set_from_ifos().

    Example:

    >>> c = CacheEntry(u"H1H2, S5 815901601 576.5 file://localhost/home/kipp/tmp/1/H1H2-815901601-576.xml")
    >>> c.segmentlistdict
    {u'H1H2': [segment(LIGOTimeGPS(815901601, 0), LIGOTimeGPS(815902177, 500000000))]}
    """
    # the import has to be done here to break the cyclic dependency
    from pycbc_glue.ligolw.lsctables import instrument_set_from_ifos
    instruments = instrument_set_from_ifos(self.observatory) or (None,)
    return segments.segmentlistdict(
        (instrument,
         segments.segmentlist([self.segment] if self.segment is not None else []))
        for instrument in instruments)
def get_by_name(self, name, clip_to_valid=False):
    """
    Retrieve the active segmentlists whose name equals name.  The
    result is a segmentlistdict indexed by instrument.  All segmentlist
    objects within it will be copies of the contents of this object;
    modifications will not affect the contents of this object.

    If clip_to_valid is True then the segmentlists will be intersected
    with their respective intervals of validity, otherwise they will be
    the verbatim active segments.

    NOTE:  the intersection operation required by clip_to_valid will
    yield undefined results unless the active and valid segmentlist
    objects are coalesced.
    """
    result = segments.segmentlistdict()
    for seglist in self:
        if seglist.name != name:
            continue
        segs = seglist.active
        if clip_to_valid:
            # do not use in-place intersection
            segs = segs & seglist.valid
        for instrument in seglist.instruments:
            if instrument in result:
                raise ValueError("multiple '%s' segmentlists for instrument '%s'" % (name, instrument))
            result[instrument] = segs.copy()
    return result
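# A hypothetical usage sketch, not part of the original module.  It assumes a
# LigolwSegments instance (the class whose .get_by_name() method is defined
# above) can be constructed from a parsed LIGO Light Weight XML document and
# that it exposes a coalesce() method; the flag name "SCIENCE" is illustrative.
def _example_get_by_name(xmldoc):
    seglists = LigolwSegments(xmldoc)
    # Coalesce first: the clip_to_valid intersection below yields undefined
    # results on uncoalesced active/valid lists (see the NOTE above).
    seglists.coalesce()
    # One segmentlist per instrument, clipped to the intervals over which
    # the flag was actually defined.  Each list is an independent copy.
    return seglists.get_by_name("SCIENCE", clip_to_valid=True)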
def segmentlistdict_from_short_string(s, boundtype=int):
    """
    Parse a string representation of a set of named segmentlists into a
    segmentlistdict object.  The string encoding is that generated by
    segmentlistdict_to_short_string().  The optional boundtype argument
    will be passed to from_range_strings() when parsing the segmentlist
    objects from the string.

    Example:

    >>> segmentlistdict_from_short_string("H1=0:10,35,100:/L1=5:15,45:60")
    {'H1': [segment(0, 10), segment(35, 35), segment(100, infinity)], 'L1': [segment(5, 15), segment(45, 60)]}

    This function, and its inverse segmentlistdict_to_short_string(),
    are intended to be used to allow small segmentlistdict objects to be
    encoded in command line options and config files.  For large
    segmentlistdict objects or when multiple sets of segmentlists are
    required, the LIGO Light Weight XML encoding available through the
    pycbc_glue.ligolw library should be used.
    """
    d = segments.segmentlistdict()
    for token in s.strip().split("/"):
        key, ranges = token.strip().split("=")
        d[key.strip()] = from_range_strings(ranges.strip().split(","), boundtype=boundtype)
    return d
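# A round-trip sketch, not in the original source: encode a small
# segmentlistdict with segmentlistdict_to_short_string() (the inverse named in
# the docstring above), then parse it back.  The instrument names and bounds
# are made up for the example.
def _example_short_string_round_trip():
    d = segments.segmentlistdict({
        "H1": segments.segmentlist([segments.segment(0, 10),
                                    segments.segment(20, 30)]),
        "L1": segments.segmentlist([segments.segment(5, 15)]),
    })
    s = segmentlistdict_to_short_string(d)
    # The encoding survives a round trip through the parser.
    assert segmentlistdict_from_short_string(s, boundtype=int) == d
    return s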
def to_segmentlistdict(self):
    """
    Return a segmentlistdict object describing the instruments and
    times spanned by the entries in this Cache.  The return value is
    coalesced.
    """
    d = segments.segmentlistdict()
    for entry in self:
        d |= entry.segmentlistdict
    return d
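# An illustrative sketch, not in the original source: build a small Cache from
# CacheEntry objects and summarize the per-instrument coverage.  The file URLs
# and GPS times are made up for the example.
def _example_cache_span():
    c = Cache([
        CacheEntry(u"H1 S5 815901601 576.5 file://localhost/tmp/H1-815901601-576.xml"),
        CacheEntry(u"L1 S5 815901601 576.5 file://localhost/tmp/L1-815901601-576.xml"),
    ])
    # The |= union used by to_segmentlistdict() coalesces overlapping
    # segments, so each instrument maps to its total, coalesced coverage.
    return c.to_segmentlistdict()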
def segmenttable_get_by_name(xmldoc, name):
    """
    Retrieve the segmentlists whose name equals name.  The result is a
    segmentlistdict indexed by instrument.

    The output of this function is not coalesced; each segmentlist
    contains the segments as found in the segment table.

    NOTE:  this is a light-weight version of the .get_by_name() method
    of the LigolwSegments class intended for use when the full machinery
    of that class is not required.  Considerably less document
    validation and error checking is performed by this version.
    Consider using that method instead if your application will be
    interfacing with the document via that class anyway.
    """
    #
    # find required tables
    #

    def_table = lsctables.SegmentDefTable.get_table(xmldoc)
    seg_table = lsctables.SegmentTable.get_table(xmldoc)

    #
    # segment_def_id --> instrument names mapping, but only for
    # segment_definer entries bearing the requested name
    #

    instrument_index = dict((row.segment_def_id, row.instruments) for row in def_table if row.name == name)

    #
    # populate result segmentlistdict object from the segment table rows
    # whose segment_def_id appears in the index
    #

    instruments = set(instrument for instruments in instrument_index.values() for instrument in instruments)
    result = segments.segmentlistdict((instrument, segments.segmentlist()) for instrument in instruments)
    for row in seg_table:
        if row.segment_def_id in instrument_index:
            seg = row.segment
            for instrument in instrument_index[row.segment_def_id]:
                result[instrument].append(seg)

    #
    # done
    #

    return result
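# A hypothetical usage sketch, not part of the original module.  It assumes
# xmldoc was parsed with something like pycbc_glue.ligolw.utils.load_filename()
# and that the document carries a segment_definer entry named "VETO".
def _example_segmenttable_get_by_name(xmldoc):
    vetoes = segmenttable_get_by_name(xmldoc, "VETO")
    # The result is not coalesced (see the docstring above); coalesce before
    # doing any set arithmetic with it.
    vetoes.coalesce()
    return vetoes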
def setup_datafind_workflow(workflow, scienceSegs, outputDir, seg_file=None,
                            tags=None):
    """
    Set up the datafind section of the workflow. This section is responsible
    for generating, or setting up the workflow to generate, a list of files
    that record the location of the frame files needed to perform the
    analysis. There could be multiple options here; the datafind jobs could
    be done at run time or could be put into a dag. The subsequent jobs will
    know what was done here from the OutFileList containing the datafind
    jobs (and the Dagman nodes if appropriate). For now the only implemented
    option is to generate the datafind files at runtime. This module can
    also check if the frameFiles actually exist, check whether the obtained
    segments line up with the original ones and update the science segments
    to reflect missing data files.

    Parameters
    ----------
    workflow: pycbc.workflow.core.Workflow
        The workflow class that stores the jobs that will be run.
    scienceSegs : Dictionary of ifo keyed glue.segments.segmentlist instances
        This contains the times that the workflow is expected to analyse.
    outputDir : path
        All output files written by datafind processes will be written to
        this directory.
    seg_file : SegFile, optional (default=None)
        The file returned by get_science_segments containing the science
        segments and the associated segment_summary. This will be used for
        the segment_summary test and is required if, and only if,
        performing that test.
    tags : list of string, optional (default=None)
        Use this to specify tags. This can be used if this module is being
        called more than once to give call specific configuration (by
        setting options in [workflow-datafind-${TAG}] rather than
        [workflow-datafind]). This is also used to tag the Files returned
        by the class to uniqueify the Files and uniqueify the actual
        filename.
        FIXME: Filenames may not be unique with current codes!

    Returns
    -------
    datafindOuts : OutGroupList
        List of all the datafind output files for use later in the pipeline.
    sci_avlble_file : SegFile
        SegFile containing the analysable time after checks in the datafind
        module are applied to the input segment list. For production runs
        this is expected to be equal to the input segment list.
    scienceSegs : Dictionary of ifo keyed glue.segments.segmentlist instances
        This contains the times that the workflow is expected to analyse. If
        the datafind-check-segment-gaps option is set to 'update_times' this
        will be updated to reflect any instances of missing data.
    sci_avlble_name : string
        The name with which the analysable time is stored in the
        sci_avlble_file.
""" if tags is None: tags = [] logging.info("Entering datafind module") make_analysis_dir(outputDir) cp = workflow.cp # Parse for options in ini file datafind_method = cp.get_opt_tags("workflow-datafind", "datafind-method", tags) if cp.has_option_tags("workflow-datafind", "datafind-check-segment-gaps", tags): checkSegmentGaps = cp.get_opt_tags("workflow-datafind", "datafind-check-segment-gaps", tags) else: checkSegmentGaps = "no_test" if cp.has_option_tags("workflow-datafind", "datafind-check-frames-exist", tags): checkFramesExist = cp.get_opt_tags("workflow-datafind", "datafind-check-frames-exist", tags) else: checkFramesExist = "no_test" if cp.has_option_tags("workflow-datafind", "datafind-check-segment-summary", tags): checkSegmentSummary = cp.get_opt_tags( "workflow-datafind", "datafind-check-segment-summary", tags) else: checkSegmentSummary = "no_test" logging.info("Starting datafind with setup_datafind_runtime_generated") if datafind_method == "AT_RUNTIME_MULTIPLE_CACHES": datafindcaches, datafindouts = \ setup_datafind_runtime_cache_multi_calls_perifo(cp, scienceSegs, outputDir, tags=tags) elif datafind_method == "AT_RUNTIME_SINGLE_CACHES": datafindcaches, datafindouts = \ setup_datafind_runtime_cache_single_call_perifo(cp, scienceSegs, outputDir, tags=tags) elif datafind_method == "AT_RUNTIME_MULTIPLE_FRAMES": datafindcaches, datafindouts = \ setup_datafind_runtime_frames_multi_calls_perifo(cp, scienceSegs, outputDir, tags=tags) elif datafind_method == "AT_RUNTIME_SINGLE_FRAMES": datafindcaches, datafindouts = \ setup_datafind_runtime_frames_single_call_perifo(cp, scienceSegs, outputDir, tags=tags) elif datafind_method == "AT_RUNTIME_FAKE_DATA": pass elif datafind_method == "FROM_PREGENERATED_LCF_FILES": ifos = scienceSegs.keys() datafindcaches, datafindouts = \ setup_datafind_from_pregenerated_lcf_files(cp, ifos, outputDir, tags=tags) else: msg = """Entry datafind-method in [workflow-datafind] does not have " expected value. Valid values are AT_RUNTIME_MULTIPLE_FRAMES, AT_RUNTIME_SINGLE_FRAMES AT_RUNTIME_MULTIPLE_CACHES, AT_RUNTIME_SINGLE_CACHES, FROM_PREGENERATED_LCF_FILES, or AT_RUNTIME_FAKE_DATA. Consult the documentation for more info.""" raise ValueError(msg) using_backup_server = False if datafind_method == "AT_RUNTIME_MULTIPLE_FRAMES" or \ datafind_method == "AT_RUNTIME_SINGLE_FRAMES": if cp.has_option_tags("workflow-datafind", "datafind-backup-datafind-server", tags): using_backup_server = True backup_server = cp.get_opt_tags("workflow-datafind", "datafind-backup-datafind-server", tags) cp_new = copy.deepcopy(cp) cp_new.set("workflow-datafind", "datafind-ligo-datafind-server", backup_server) cp_new.set('datafind', 'urltype', 'gsiftp') backup_datafindcaches, backup_datafindouts =\ setup_datafind_runtime_frames_single_call_perifo(cp_new, scienceSegs, outputDir, tags=tags) backup_datafindouts = datafind_keep_unique_backups(\ backup_datafindouts, datafindouts) datafindcaches.extend(backup_datafindcaches) datafindouts.extend(backup_datafindouts) logging.info("setup_datafind_runtime_generated completed") # If we don't have frame files covering all times we can update the science # segments. 
    if checkSegmentGaps in ['warn', 'update_times', 'raise_error']:
        logging.info("Checking science segments against datafind output....")
        newScienceSegs = get_science_segs_from_datafind_outs(datafindcaches)
        logging.info("New segments calculated from data find output.....")
        missingData = False
        for ifo in scienceSegs.keys():
            # If no science segments in input then do nothing
            if not scienceSegs[ifo]:
                msg = "No science segments are present for ifo %s, " % (ifo)
                msg += "the segment metadata indicates there is no analyzable"
                msg += " strain data between the selected GPS start and end "
                msg += "times."
                logging.warning(msg)
                continue
            if ifo not in newScienceSegs:
                msg = "No data frames were found corresponding to the science "
                msg += "segments for ifo %s" % (ifo)
                logging.error(msg)
                missingData = True
                if checkSegmentGaps == 'update_times':
                    scienceSegs[ifo] = segments.segmentlist()
                continue
            missing = scienceSegs[ifo] - newScienceSegs[ifo]
            if abs(missing):
                msg = "From ifo %s we are missing frames covering:" % (ifo)
                msg += "\n%s" % "\n".join(map(str, missing))
                missingData = True
                logging.error(msg)
                if checkSegmentGaps == 'update_times':
                    # Remove missing time, so that we can carry on if desired
                    logging.info("Updating science segments for ifo %s." % (ifo))
                    scienceSegs[ifo] = scienceSegs[ifo] - missing
        if checkSegmentGaps == 'raise_error' and missingData:
            raise ValueError("Workflow cannot find needed data, exiting.")
        logging.info("Done checking, any discrepancies are reported above.")
    elif checkSegmentGaps == 'no_test':
        pass
    else:
        errMsg = "checkSegmentGaps kwarg must take a value from 'no_test', "
        errMsg += "'warn', 'update_times' or 'raise_error'."
        raise ValueError(errMsg)

    # Do all of the frame files that were returned actually exist?
    if checkFramesExist in ['warn', 'update_times', 'raise_error']:
        logging.info("Verifying that all frames exist on disk.")
        missingFrSegs, missingFrames = \
            get_missing_segs_from_frame_file_cache(datafindcaches)
        missingFlag = False
        for ifo in missingFrames.keys():
            # If no data in the input then do nothing
            if not scienceSegs[ifo]:
                continue
            # If using a backup server, does the frame exist remotely?
            if using_backup_server:
                # WARNING: This will be slow, but hopefully it will not occur
                #          for too many frames. This could be optimized if
                #          it becomes necessary.
                new_list = []
                for frame in missingFrames[ifo]:
                    for dfout in datafindouts:
                        dfout_pfns = list(dfout.pfns)
                        dfout_urls = [a.url for a in dfout_pfns]
                        if frame.url in dfout_urls:
                            pfn = dfout_pfns[dfout_urls.index(frame.url)]
                            dfout.removePFN(pfn)
                            if len(dfout.pfns) == 0:
                                new_list.append(frame)
                            else:
                                msg = "Frame %s not found locally. " \
                                    % (frame.url,)
                                msg += "Replacing with remote url(s) %s." \
                                    % (str([a.url for a in dfout.pfns]),)
                                logging.info(msg)
                            break
                    else:
                        new_list.append(frame)
                missingFrames[ifo] = new_list

            if missingFrames[ifo]:
                msg = "From ifo %s we are missing the following frames:" % (ifo)
                msg += "\n" + "\n".join([a.url for a in missingFrames[ifo]])
                missingFlag = True
                logging.error(msg)
                if checkFramesExist == 'update_times':
                    # Remove missing times, so that we can carry on if desired
                    logging.info("Updating science times for ifo %s." % (ifo))
                    scienceSegs[ifo] = scienceSegs[ifo] - missingFrSegs[ifo]
        if checkFramesExist == 'raise_error' and missingFlag:
            raise ValueError("Workflow cannot find all frames, exiting.")
        logging.info("Finished checking frames.")
    elif checkFramesExist == 'no_test':
        pass
    else:
        errMsg = "checkFramesExist kwarg must take a value from 'no_test', "
        errMsg += "'warn', 'update_times' or 'raise_error'."
        raise ValueError(errMsg)

    # Check if there are cases where frames exist, but no entry in the
    # segment summary table is present.
    if checkSegmentSummary in ['warn', 'raise_error']:
        logging.info("Checking the segment summary table against frames.")
        dfScienceSegs = get_science_segs_from_datafind_outs(datafindcaches)
        missingFlag = False
        # NOTE: Should this be overrideable in the config file?
        sci_seg_name = "SCIENCE"
        if seg_file is None:
            err_msg = "You must provide the science segments SegFile object "
            err_msg += "if using the datafind-check-segment-summary option."
            raise ValueError(err_msg)
        if seg_file.seg_summ_dict is None:
            err_msg = "The provided science segments SegFile object must "
            err_msg += "contain a valid segment_summary table if using the "
            err_msg += "datafind-check-segment-summary option."
            raise ValueError(err_msg)
        seg_summary_times = seg_file.seg_summ_dict
        for ifo in dfScienceSegs.keys():
            curr_seg_summ_times = seg_summary_times[ifo + ":" + sci_seg_name]
            missing = (dfScienceSegs[ifo] & seg_file.valid_segments)
            missing.coalesce()
            missing = missing - curr_seg_summ_times
            missing.coalesce()
            scienceButNotFrame = scienceSegs[ifo] - dfScienceSegs[ifo]
            scienceButNotFrame.coalesce()
            missing2 = scienceSegs[ifo] - scienceButNotFrame
            missing2.coalesce()
            missing2 = missing2 - curr_seg_summ_times
            missing2.coalesce()
            if abs(missing):
                msg = "From ifo %s the following times have frames, " % (ifo)
                msg += "but are not covered in the segment summary table."
                msg += "\n%s" % "\n".join(map(str, missing))
                logging.error(msg)
                missingFlag = True
            if abs(missing2):
                msg = "From ifo %s the following times have frames, " % (ifo)
                msg += "are science, and are not covered in the segment "
                msg += "summary table."
                msg += "\n%s" % "\n".join(map(str, missing2))
                logging.error(msg)
                missingFlag = True
        if checkSegmentSummary == 'raise_error' and missingFlag:
            errMsg = "Segment_summary discrepancy detected, exiting."
            raise ValueError(errMsg)
    elif checkSegmentSummary == 'no_test':
        pass
    else:
        errMsg = "checkSegmentSummary kwarg must take a value from 'no_test', "
        errMsg += "'warn', or 'raise_error'."
        raise ValueError(errMsg)

    # Now need to create the file for SCIENCE_AVAILABLE
    sci_avlble_dict = segments.segmentlistdict()
    # NOTE: Should this be overrideable in the config file?
    sci_avlble_name = "SCIENCE_AVAILABLE"
    for ifo in scienceSegs.keys():
        sci_avlble_dict[ifo + ':' + sci_avlble_name] = scienceSegs[ifo]

    sci_avlble_file = SegFile.from_segment_list_dict(
        'SCIENCE_AVAILABLE', sci_avlble_dict, ifo_list=scienceSegs.keys(),
        valid_segment=workflow.analysis_time, extension='.xml', tags=tags,
        directory=outputDir)

    logging.info("Leaving datafind module")
    if datafind_method == "AT_RUNTIME_FAKE_DATA":
        datafindouts = None
    else:
        datafindouts = FileList(datafindouts)

    return datafindouts, sci_avlble_file, scienceSegs, sci_avlble_name
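# An illustrative [workflow-datafind] configuration, not from the original
# source.  The option names mirror the parsing above; the backup server
# address is a placeholder.
#
#   [workflow-datafind]
#   datafind-method = AT_RUNTIME_SINGLE_FRAMES
#   datafind-check-segment-gaps = update_times
#   datafind-check-frames-exist = update_times
#   datafind-check-segment-summary = warn
#   ; optional fallback, used only with the AT_RUNTIME_*_FRAMES methods
#   datafind-backup-datafind-server = datafind.backup.example.org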