def update_run_report(date_folders):
    """
    Update the run report collection for the given date folders.

    @param date_folders: List of date folders in the form MM_DD_YY that you
                         want to update.
    """
    # fetch utags in run report collection
    db_utags = _DB_CONNECTOR.distinct(RUN_REPORT_COLLECTION, UTAG)

    if os.path.isdir(RUN_REPORT_PATH):
        reports = list()
        for folder in date_folders:
            path = os.path.join(RUN_REPORT_PATH, folder)
            if not os.path.isdir(path):
                continue
            date_obj = datetime.strptime(folder, '%m_%d_%y')
            for sf in os.listdir(path):
                report_file_path = get_run_info_path(path, sf)
                if report_file_path is None:
                    continue
                # on-disk folder holding this report's data
                data_folder = os.path.join(path, sf)
                utag = set_utag(date_obj, sf)
                if utag not in db_utags:
                    # if not exists, need to insert to collection
                    log_data = read_report_file(report_file_path, date_obj, utag)
                    if log_data is None:
                        log_data = {DATETIME: date_obj, UTAG: utag}
                    if IMAGE_STACKS in log_data:
                        hdf5_datasets = get_hdf5_datasets(log_data, data_folder)
                        log_data[IMAGE_STACKS].extend(hdf5_datasets)
                    reports.append(log_data)
                    APP_LOGGER.info("Adding run report %s" % report_file_path)
                else:
                    # if exists, check HDF5 collection for new datasets
                    log_data = _DB_CONNECTOR.find_one(RUN_REPORT_COLLECTION,
                                                      UTAG, utag)
                    # If previously a run report was not there or had the wrong
                    # format, the mongo document only has three fields: _id,
                    # datetime, and unique_tag. If this occurs, try reading the
                    # run report again.
                    if len(log_data.keys()) == 3:
                        log_data = read_report_file(report_file_path, date_obj,
                                                    utag)
                    if log_data is not None and IMAGE_STACKS in log_data:
                        hdf5_datasets = get_hdf5_datasets(log_data, data_folder)
                        exist_datasets = log_data[IMAGE_STACKS]
                        if set(hdf5_datasets) - set(exist_datasets):
                            updated_datasets = list(set(hdf5_datasets) |
                                                    set(exist_datasets))
                            _DB_CONNECTOR.update(
                                RUN_REPORT_COLLECTION,
                                {UTAG: utag},
                                {"$set": {IMAGE_STACKS: updated_datasets}})

        APP_LOGGER.info("Found %d run reports" % (len(reports)))
        if len(reports) > 0:
            # There is a possible race condition here. Ideally these operations
            # would be performed in concert atomically
            _DB_CONNECTOR.insert(RUN_REPORT_COLLECTION, reports)
def parse_pa_data_src(pa_data_src_name):
    """
    Determine the primary analysis data source type (HDF5 or image stack) and
    return a list of (data source name, is_hdf5) tuples.

    @param pa_data_src_name: String, name of data source, could be either the
                             HDF5 dataset name or a folder name containing
                             image stacks
    @return: A list of tuples, each tuple contains the primary analysis
             datasource name and a bool indicating whether or not it is HDF5.
    """
    # archives is a list of tuples, each containing the data source name and
    # a flag indicating whether it is an HDF5 dataset
    archives = list()
    if is_hdf5_archive(pa_data_src_name):
        archives.append((pa_data_src_name, True))
        APP_LOGGER.info('%s is an HDF5 file.' % pa_data_src_name)
    elif is_image_archive(pa_data_src_name):
        image_archive_paths = io_utilities.get_archive_dirs(
            pa_data_src_name,
            min_num_images=PA_MIN_NUM_IMAGES)
        for img_src_name in image_archive_paths:
            archives.append((img_src_name, False,))
        APP_LOGGER.info('%s is an image stack.' % pa_data_src_name)
    else:
        raise Exception(
            'Unable to determine if %s is an image stack or HDF5 file.'
            % pa_data_src_name)
    return archives
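# --- Illustrative usage sketch (not part of the original module). ---
# It shows how a caller might consume the (data source name, is_hdf5) tuples
# returned by parse_pa_data_src above. The function name and log messages are
# hypothetical; only parse_pa_data_src and APP_LOGGER come from this module.
def _example_consume_pa_data_src(pa_data_src_name):
    for src_name, is_hdf5 in parse_pa_data_src(pa_data_src_name):
        if is_hdf5:
            APP_LOGGER.info('Would read datasets from HDF5 source %s' % src_name)
        else:
            APP_LOGGER.info('Would read images from image stack %s' % src_name)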
def update_image_stacks(log_data, data_folder):
    """
    Check whether the image_stacks in a run report document exist in the
    archive collection. If not, add them to the database.

    @param log_data:    the document of run report yaml
    @param data_folder: folder where data is located
    """
    if log_data is None or IMAGE_STACKS not in log_data:
        return

    new_records = list()
    for image_stack in log_data[IMAGE_STACKS]:
        exist_record = _DB_CONNECTOR.find_one(ARCHIVES_COLLECTION, ARCHIVE,
                                              image_stack)
        if not exist_record:
            for folder in [ARCHIVES_PATH, data_folder]:
                archive_path = os.path.join(folder, image_stack)
                if os.path.isdir(archive_path):
                    new_records.append({
                        ARCHIVE: image_stack,
                        ARCHIVE_PATH: remove_disk_directory(archive_path)
                    })
                    break

    if new_records:
        APP_LOGGER.info('Found %d image stacks: %s'
                        % (len(new_records), new_records))
        _DB_CONNECTOR.insert(ARCHIVES_COLLECTION, new_records)
def process_request(cls, params_dict):
    dataset = params_dict[cls.dataset_parameter][0]
    report_uuid = params_dict[cls.report_uuid_parameter][0]
    http_status_code = 200
    json_response = {RUN_REPORT_UUID: report_uuid, HDF5_DATASET: dataset}

    try:
        cls._DB_CONNECTOR.update(
            RUN_REPORT_COLLECTION,
            {UUID: report_uuid},
            {'$pull': {IMAGE_STACKS: {'name': dataset, 'upload': True}}})
        cls._DB_CONNECTOR.remove(HDF5_COLLECTION, {HDF5_DATASET: dataset})
        json_response.update({"unassociate": True})
        APP_LOGGER.info("Removed dataset name=%s from run report uuid=%s"
                        % (dataset, report_uuid))
    except:
        APP_LOGGER.exception(traceback.format_exc())
        json_response[ERROR] = str(sys.exc_info()[1])
        http_status_code = 500

    return json_response, http_status_code
def process_request(cls, params_dict):
    tags = [t for t in params_dict[cls.tags_parameter] if t]
    report_uuid = params_dict[cls.report_uuid_parameter][0]
    http_status_code = 200
    json_response = {RUN_REPORT_UUID: report_uuid, TAGS: tags}

    try:
        cls._DB_CONNECTOR.update(RUN_REPORT_COLLECTION,
                                 {UUID: report_uuid},
                                 {'$addToSet': {TAGS: {'$each': tags}}})
        APP_LOGGER.info("Updated run report uuid=%s with tags %s."
                        % (report_uuid, tags))
        json_response[STATUS] = SUCCEEDED
    except:
        APP_LOGGER.exception(traceback.format_exc())
        json_response[STATUS] = FAILED
        json_response[ERROR] = str(sys.exc_info()[1])
        http_status_code = 500

    return make_clean_response(json_response, http_status_code)
def _generate(self, ndyes, nchoose=5):
    """
    @param ndyes:   Integer, number of dyes to use per solution
    @param nchoose: Integer, maximum number of combinations that will be
                    further optimized
    """
    # check whether the minimum/maximum numbers of dye levels can produce the
    # requested number of barcodes
    min_nbarcodes = numpy.product(
        self._barcode_min_nlvls[numpy.argsort(self._barcode_min_nlvls)[:ndyes]])
    max_nbarcodes = numpy.product(
        self._barcode_max_nlvls[numpy.argsort(self._barcode_max_nlvls)[-ndyes:]])

    # too many dyes were selected
    if min_nbarcodes > self._requested_nbarcodes:
        APP_LOGGER.info('Cannot generate requested number of barcodes (%d). '
                        'Smallest library would have %d barcodes.'
                        % (self._requested_nbarcodes, min_nbarcodes))
        return

    # too few dyes were selected
    if max_nbarcodes < self._requested_nbarcodes:
        APP_LOGGER.info('Cannot generate requested number of barcodes (%d). '
                        'Largest library would have %d barcodes.'
                        % (self._requested_nbarcodes, max_nbarcodes))
        return

    # find the optimal number of levels for each dye combination
    requested_dye_idxs = set(range(len(self._requested_dye_lots)))
    optimal_nlvls = list()
    for dye_idxs in itertools.combinations(xrange(len(self._barcode_profiles)),
                                           ndyes):
        dye_idxs = numpy.array(dye_idxs)
        # ignore combinations that do not include requested dyes
        if self.need_additional_db_dyes and \
           self._requested_dye_lots and \
           not requested_dye_idxs.issubset(dye_idxs):
            continue
        # ignore combinations in which the peaks are too close
        peaks = numpy.concatenate((self._barcode_peaks[dye_idxs],
                                   self._non_barcode_peaks))
        if numpy.any(numpy.diff(numpy.sort(peaks)) < self._min_peak_difference):
            continue
        try:
            candidate_nlvls, candidate_lowest_peak = \
                self._calc_optimal_nlvls(dye_idxs)
            optimal_nlvls.append((candidate_lowest_peak, dye_idxs,
                                  candidate_nlvls))
        except Exception as e:
            APP_LOGGER.exception(e)

    optimal_nlvls.sort(key=lambda x: x[0])
    for _, dye_idxs, nlvls in optimal_nlvls[:nchoose]:
        try:
            self._make_design(nlvls, dye_idxs)
        except Exception as e:
            APP_LOGGER.exception(e)
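# --- Illustrative sketch (not part of the original module). ---
# It demonstrates the feasibility bounds checked at the top of _generate with
# hypothetical level counts: the smallest possible library uses the ndyes dyes
# with the fewest levels, the largest uses the ndyes dyes with the most levels,
# and the requested barcode count must fall between the two.
def _example_barcode_bounds():
    import numpy
    min_nlvls = numpy.array([2, 3, 3, 4])  # hypothetical per-dye minimum levels
    max_nlvls = numpy.array([4, 5, 6, 8])  # hypothetical per-dye maximum levels
    ndyes = 2
    smallest = numpy.product(numpy.sort(min_nlvls)[:ndyes])   # 2 * 3 = 6
    largest = numpy.product(numpy.sort(max_nlvls)[-ndyes:])   # 6 * 8 = 48
    return smallest, largest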
def get_hdf5_datasets(log_data, data_folder):
    """
    Fetch the HDF5 datasets associated with a run report.

    @param log_data:    the document of run report yaml
    @param data_folder: folder where data is located
    @return: set of HDF5 dataset names associated with the run report
    """
    if log_data is None or RUN_ID not in log_data:
        return set()

    run_id = log_data[RUN_ID]
    hdf5_paths = [os.path.join(data_folder, f + '.h5')
                  for f in [run_id, run_id + '-baseline']
                  if os.path.isfile(os.path.join(data_folder, f + '.h5'))]

    all_datasets = set()
    for path in hdf5_paths:
        exist_records = _DB_CONNECTOR.find(
            HDF5_COLLECTION, {HDF5_PATH: remove_disk_directory(path)})
        if exist_records:
            all_datasets.update(set(r[HDF5_DATASET] for r in exist_records))
            continue

        new_records = list()
        try:
            with h5py.File(path) as h5_file:
                dataset_names = h5_file.keys()
                for dsname in dataset_names:
                    if re.match(r'^\d{4}-\d{2}-\d{2}_\d{4}\.\d{2}', dsname):
                        new_records.append({
                            HDF5_PATH: remove_disk_directory(path),
                            HDF5_DATASET: dsname,
                        })
        except:
            APP_LOGGER.exception(
                'Unable to get dataset information from HDF5 file: %s' % path)

        if new_records:
            APP_LOGGER.info('Found %d datasets from HDF5 file: %s'
                            % (len(new_records), path))
            _DB_CONNECTOR.insert(HDF5_COLLECTION, new_records)
            all_datasets.update(set(r[HDF5_DATASET] for r in new_records))

    return all_datasets
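# --- Quick illustration (not part of the original module). ---
# It shows the dataset-name filter used by get_hdf5_datasets: names must start
# with a YYYY-MM-DD_HHMM.SS style timestamp. The candidate names below are
# hypothetical; re.match only anchors at the start, so longer names that begin
# with the timestamp also pass.
def _example_dataset_name_filter():
    candidates = ['2016-05-10_1432.01', '2016-05-10_1432.01-baseline', 'notes']
    pattern = r'^\d{4}-\d{2}-\d{2}_\d{4}\.\d{2}'
    return [name for name in candidates if re.match(pattern, name)]
    # -> ['2016-05-10_1432.01', '2016-05-10_1432.01-baseline']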
def get_variants(exp_def_name):
    """
    Return a list of variants in the experiment definition file.
    """
    APP_LOGGER.info("Retrieving list of variants from %s" % (exp_def_name, ))
    exp_def_doc = _DB_CONNECTOR.find_one(EXP_DEF_COLLECTION, NAME, exp_def_name)
    if exp_def_doc is not None:
        APP_LOGGER.info("Experiment definition %s found in EXP_DEF_COLLECTION."
                        % (exp_def_name, ))
        return exp_def_doc[VARIANTS]

    APP_LOGGER.debug("Failed to find experiment definition %s from "
                     "EXP_DEF_COLLECTION." % (exp_def_name, ))
    return []
def get_experiment_defintions():
    """
    Retrieve experiment definitions from EXP_DEF_COLLECTION.
    """
    columns = OrderedDict()
    columns[ID] = 0
    columns[UUID] = 1
    columns[NAME] = 1
    columns[VARIANTS] = 1
    columns[DYES] = 1
    columns[TYPE] = 1
    column_names = columns.keys()
    column_names.remove(ID)

    exp_defs = _DB_CONNECTOR.find(EXP_DEF_COLLECTION, {}, columns)
    APP_LOGGER.info('Retrieved %d experiment definitions.' % (len(exp_defs), ))
    return (exp_defs, column_names, None)
def process_request(cls, params_dict):
    tag = params_dict[cls.tag_parameter][0]
    report_uuid = params_dict[cls.report_uuid_parameter][0]
    http_status_code = 200
    json_response = {RUN_REPORT_UUID: report_uuid, TAGS: [tag]}

    try:
        cls._DB_CONNECTOR.update(RUN_REPORT_COLLECTION,
                                 {UUID: report_uuid},
                                 {'$pull': {TAGS: tag}})
        json_response[STATUS] = SUCCEEDED
        APP_LOGGER.info("Removed tag name=%s from run report uuid=%s"
                        % (tag, report_uuid))
    except:
        APP_LOGGER.exception(traceback.format_exc())
        json_response[ERROR] = str(sys.exc_info()[1])
        json_response[STATUS] = FAILED
        http_status_code = 500

    return json_response, http_status_code
def process_request(cls, params_dict, del_file_keys=(RESULT,)):
    response = {}
    http_status_code = 200
    uuids = params_dict[ParameterFactory.job_uuid(cls.get_collection())]
    criteria = {UUID: {"$in": uuids}}

    APP_LOGGER.info("Deleting the following jobs: %s" % ",".join(uuids))
    records = cls._DB_CONNECTOR.find(cls.get_collection(), criteria, {ID: 0})

    response["deleted"] = {}
    if len(records) > 0:
        # Keep a copy of the records being deleted
        for record in records:
            response["deleted"][record[UUID]] = record

        # Delete records from database
        result = cls._DB_CONNECTOR.remove(cls.get_collection(), criteria)

        # Delete files from disk only if removal from DB was successful
        if result and result['n'] == len(response["deleted"]):
            for _, record in response["deleted"].iteritems():
                for key in del_file_keys:
                    file_path = record.get(key, None)
                    if file_path is not None and os.path.isfile(file_path):
                        os.remove(file_path)
        else:
            del response["deleted"]
            raise Exception("Error deleting records from the "
                            "database: %s" % result)

        APP_LOGGER.info("Successfully deleted the following jobs: %s"
                        % ",".join(uuids))
    else:
        http_status_code = 404

    return response, http_status_code
def gen_dye_scatterplot(self, dyes, sys_listener_path):
    try:
        analysis_df = pandas.read_table(
            self.analysis_file,
            sep=sniff_delimiter(self.analysis_file))
        ac_df = pandas.read_table(
            self.tmp_outfile_path,
            sep=sniff_delimiter(self.tmp_outfile_path))
        analysis_df['assay'] = False
        analysis_df.loc[analysis_df['identity'].notnull(), 'assay'] = \
            ac_df['assay'].values

        # System listener inputs
        dyn_align_offsets = {}
        temps = {}
        steps = {}
        if sys_listener_path is not None:
            sys_listener_dir = os.path.dirname(sys_listener_path)
            clamp_temp_tp = ClampTempTopicParser()
            old_channel_offset_tp = OldChannelOffsetTopicParser()
            channel_offset_tp = ChannelOffsetTopicParser()
            dyn_align_steps_tp = DynamicAlignStepsParser()
            topic_parsers = [clamp_temp_tp, old_channel_offset_tp,
                             channel_offset_tp, dyn_align_steps_tp]
            sys_listener_parser = SystemListenerParser(
                sys_listener_dir, topic_parsers=topic_parsers)
            temps = sys_listener_parser.get_topic_results(clamp_temp_tp.topic)
            dyn_align_offsets = sys_listener_parser.get_topic_results(
                channel_offset_tp.topic)
            if len(dyn_align_offsets) < 1:
                APP_LOGGER.info("Using old channel offset parser...")
                dyn_align_offsets = sys_listener_parser.get_topic_results(
                    old_channel_offset_tp.topic)
            else:
                APP_LOGGER.info("Using new channel offset parser...")
            steps = sys_listener_parser.get_topic_results(
                dyn_align_steps_tp.topic)

        generate_dye_scatterplots(analysis_df,
                                  dyes,
                                  self.tmp_dyes_plot_path,
                                  self.job_name,
                                  self.pico1_dye,
                                  dyn_align_offsets=dyn_align_offsets,
                                  temps=temps,
                                  steps=steps)
        shutil.copy(self.tmp_dyes_plot_path, self.dyes_plot_path)
        APP_LOGGER.info("Dyes scatter plot generated for %s." % self.job_name)
    except:
        APP_LOGGER.exception("Dyes scatter plot generation failed.")
def update_archives():
    '''
    Update the database with available primary analysis archives. It is
    not an error if zero archives are available at this moment.

    @return True if database is successfully updated, False otherwise
    '''
    APP_LOGGER.info("Updating database with available archives...")
    exist_archives = _DB_CONNECTOR.distinct(ARCHIVES_COLLECTION, ARCHIVE)
    if os.path.isdir(ARCHIVES_PATH):
        # Remove archives named similarly (same name, different capitalization)
        archives = io_utilities.get_subfolders(ARCHIVES_PATH)

        # Check yyyy_mm/dd/HHMM_pilotX location
        run_folders = get_run_folders()
        for folder in run_folders:
            archives.extend(io_utilities.get_subfolders(folder))

        new_archives = [x for x in archives
                        if os.path.basename(x) not in exist_archives]
        records = [{ARCHIVE: os.path.basename(archive),
                    ARCHIVE_PATH: remove_disk_directory(archive)}
                   for archive in new_archives]
        APP_LOGGER.info("Found %d archives" % (len(records)))
        if len(records) > 0:
            # There is a possible race condition here. Ideally these operations
            # would be performed in concert atomically
            _DB_CONNECTOR.insert(ARCHIVES_COLLECTION, records)
    else:
        APP_LOGGER.error(
            "Couldn't locate archives path '%s', to update database."
            % ARCHIVES_PATH)
        return False

    APP_LOGGER.info("Database successfully updated with available archives.")
    return True
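# --- Small sketch (hypothetical paths, not part of the original module). ---
# It isolates the new-archive filter used in update_archives: an archive is
# considered new when its basename is not already in the archive collection.
def _example_filter_new_archives():
    exist_archives = set(['old_run'])
    archives = ['/mnt/runs/old_run', '/mnt/runs/new_run']
    return [x for x in archives if os.path.basename(x) not in exist_archives]
    # -> ['/mnt/runs/new_run']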
def update_dyes():
    '''
    Update the database with available dyes.

    @return True if database is successfully updated, False otherwise
    '''
    APP_LOGGER.info("Updating database with available dyes...")
    try:
        records = [{DYE: dye} for dye in _DATASTORE.dyes()]
        assert len(records) > 0, "Internal error: No dyes found"
        # There is a possible race condition here. Ideally these operations
        # would be performed in concert atomically
        _DB_CONNECTOR.remove(DYES_COLLECTION, {})
        _DB_CONNECTOR.insert(DYES_COLLECTION, records)
    except:
        APP_LOGGER.info("Failed to update database with available dyes: %s",
                        str(sys.exc_info()))
        raise

    APP_LOGGER.info("Database successfully updated with available dyes.")
    return True
def process_request(cls, params_dict):
    filenames = params_dict[cls.filenames_parameter]
    report_uuid = params_dict[cls.report_uuid_parameter][0]
    http_status_code = 200
    json_response = {RUN_REPORT_UUID: report_uuid, FILENAMES: filenames}

    filepaths = [os.path.join(MODIFIED_ARCHIVES_PATH, secure_filename(fn))
                 for fn in filenames]

    if not filenames or not report_uuid or not all(
            allowed_file(fp) for fp in filepaths):
        http_status_code = 400
    elif any(cls._DB_CONNECTOR.find_one(HDF5_COLLECTION, HDF5_PATH,
                                        {'$regex': fn + '$'}) is not None
             for fn in filenames):
        http_status_code = 403
    else:
        try:
            fp_to_datasets, duplicate = get_datasets_from_files(filepaths)
            if not fp_to_datasets or duplicate:
                http_status_code = 403
            else:
                new_hdf5_records = [{HDF5_PATH: fp,
                                     HDF5_DATASET: dsname,
                                     "upload": True}
                                    for fp in fp_to_datasets
                                    for dsname in fp_to_datasets[fp]]
                cls._DB_CONNECTOR.insert(HDF5_COLLECTION, new_hdf5_records)
                APP_LOGGER.info('Updated database with %d new HDF5 files'
                                % len(new_hdf5_records))

                run_report = cls._DB_CONNECTOR.find_one(RUN_REPORT_COLLECTION,
                                                        UUID, report_uuid)
                if run_report:
                    exist_datasets = set([
                        d for d in run_report[IMAGE_STACKS]
                        if isinstance(d, str) or isinstance(d, unicode)
                    ])
                    new_datasets = set()
                    for datasets in fp_to_datasets.values():
                        new_datasets = new_datasets | datasets
                    new_datasets = list(new_datasets - exist_datasets)
                    if new_datasets:
                        cls._DB_CONNECTOR.update(
                            RUN_REPORT_COLLECTION,
                            {UUID: report_uuid},
                            {'$addToSet': {
                                IMAGE_STACKS: {
                                    '$each': [{'name': d, 'upload': True}
                                              for d in new_datasets]
                                }
                            }})
                        APP_LOGGER.info(
                            "Updated run report uuid=%s with %d HDF5 datasets."
                            % (report_uuid, len(new_datasets)))
                    del run_report[ID]
                    json_response.update({"run_report": run_report,
                                          "uploaded": new_datasets})
                else:
                    json_response.update({
                        "error": "Run report uuid=%s does not exist."
                                 % report_uuid})
        except:
            APP_LOGGER.exception(traceback.format_exc())
            json_response[ERROR] = str(sys.exc_info()[1])
            http_status_code = 500

    return make_clean_response(json_response, http_status_code)
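# --- Sketch (hypothetical values, not part of the original module). ---
# It shows how the fp_to_datasets mapping returned by get_datasets_from_files
# is flattened into HDF5 collection records by the upload handler above; the
# file path and dataset names are made up for illustration.
def _example_flatten_fp_to_datasets():
    fp_to_datasets = {'/uploads/run_a.h5': set(['2016-05-10_1432.01',
                                                '2016-05-10_1432.01-baseline'])}
    return [{HDF5_PATH: fp, HDF5_DATASET: dsname, 'upload': True}
            for fp in fp_to_datasets
            for dsname in fp_to_datasets[fp]]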
def get_run_reports(cartridge_sn=None):
    """
    Retrieve a list of run reports.
    """
    columns = OrderedDict()
    columns[ID] = 0
    columns[UUID] = 1
    columns[DATETIME] = 1
    columns[DEVICE_NAME] = 1
    columns[EXP_DEF_NAME] = 1
    columns[RUN_DESCRIPTION] = 1
    columns[SAMPLE_NAME] = 1
    columns[CARTRIDGE_SN] = 1
    columns[CARTRIDGE_SN_OLD] = 1
    columns[CARTRIDGE_BC] = 1
    columns[IMAGE_STACKS] = 1
    columns[PICO1_DYE] = 1
    columns[EXPERIMENT_CONFIGS] = 1
    columns[TAGS] = 1
    column_names = columns.keys()
    column_names.remove(ID)

    query = {
        UUID: {'$exists': True},
        DEVICE_NAME: {'$ne': ''},
        EXP_DEF_NAME: {'$ne': None},
        IMAGE_STACKS: {'$ne': None, '$not': {'$size': 0}},
    }
    if cartridge_sn is not None:
        query.update({
            '$or': [
                {CARTRIDGE_SN: cartridge_sn},
                {CARTRIDGE_SN_OLD: cartridge_sn},
                {'{0}.{1}'.format(CARTRIDGE_BC, 'serial_num'): cartridge_sn},
            ]
        })

    reports = _DB_CONNECTOR.find(RUN_REPORT_COLLECTION, query, columns)
    APP_LOGGER.info('Retrieved %d run reports with image stack(s)'
                    % (len(reports), ))

    if reports:
        all_jobs = _DB_CONNECTOR.find(FA_PROCESS_COLLECTION, {})
        job_map = defaultdict(list)
        for job in all_jobs:
            job_map[job[ARCHIVE]].append(job)

        for report in reports:
            report[DATA_TO_JOBS] = dict()
            for archive in report[IMAGE_STACKS]:
                archive_name = archive['name'] if isinstance(archive, dict) \
                               else archive
                job_status = {STATUS: 'not processed', 'job_uuids': list()}
                jobs = job_map[archive_name] if archive_name in job_map \
                       else list()
                if jobs:
                    if any(j[STATUS] == RUNNING for j in jobs):
                        job_status[STATUS] = RUNNING
                    elif any(j[STATUS] == SUBMITTED for j in jobs):
                        job_status[STATUS] = SUBMITTED
                    elif any(j[STATUS] == SUCCEEDED for j in jobs):
                        job_status[STATUS] = SUCCEEDED
                    else:
                        job_status[STATUS] = FAILED
                    job_status['job_uuids'] = [j[UUID] for j in jobs]
                report[DATA_TO_JOBS][archive_name] = job_status

    return (reports, column_names, None)
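# --- Sketch (not part of the original module). ---
# It isolates the per-archive status precedence applied in get_run_reports:
# running beats submitted, submitted beats succeeded, and anything else
# collapses to failed; an empty job list means the archive was never processed.
def _example_job_status_precedence(jobs):
    if not jobs:
        return 'not processed'
    if any(j[STATUS] == RUNNING for j in jobs):
        return RUNNING
    if any(j[STATUS] == SUBMITTED for j in jobs):
        return SUBMITTED
    if any(j[STATUS] == SUCCEEDED for j in jobs):
        return SUCCEEDED
    return FAILED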
def update_run_reports(date_folders=None):
    '''
    Update the database with available run reports. It is not an error
    if zero reports are available at this moment.

    @param date_folders: optional list of date folders to restrict the update
                         to; if None, folders from roughly the past week
                         (relative to the most recent report in the database)
                         are scanned.
    @return True if database is successfully updated, False otherwise
    '''
    APP_LOGGER.info("Updating database with available run reports...")
    # fetch utags from run report collection
    db_utags = _DB_CONNECTOR.distinct(RUN_REPORT_COLLECTION, UTAG)
    if os.path.isdir(RUN_REPORT_PATH):
        if date_folders is None:
            try:
                latest_date = _DB_CONNECTOR.find_max(RUN_REPORT_COLLECTION,
                                                     DATETIME)[DATETIME]
            except TypeError:
                latest_date = datetime.now()

            def valid_date(folder):
                date_obj = get_date_object(folder)
                return date_obj >= latest_date - timedelta(days=6)

            date_folders = [
                folder for folder in os.listdir(RUN_REPORT_PATH)
                if re.match(r'\d{2}_\d{2}_\d{2}', folder) and valid_date(folder)
            ]

            # New file location
            new_date_folders = get_date_folders()
            date_folders.extend(f for f in new_date_folders if valid_date(f))

        date_folders = [os.path.join(RUN_REPORT_PATH, f) for f in date_folders]
        date_folders = [f for f in date_folders if os.path.isdir(f)]

        reports = list()
        for folder in date_folders:
            for sf in os.listdir(folder):
                report_file_path = get_run_info_path(folder, sf)
                if report_file_path is None:
                    continue

                date_obj = get_date_object(folder)
                data_folder = os.path.join(RUN_REPORT_PATH, folder, sf)
                utag = set_utag(date_obj, sf)
                if utag not in db_utags:
                    # if not exists, need to insert to collection
                    log_data = read_report_file(report_file_path, date_obj, utag)
                    if log_data is None or all(
                            not log_data[DEVICE_NAME].lower().startswith(x)
                            for x in ['pilot', 'beta']):
                        log_data = {DATETIME: date_obj, UTAG: utag}
                    if IMAGE_STACKS in log_data:
                        # add image stacks to archive collection
                        update_image_stacks(log_data, data_folder)
                        # find HDF5 datasets and add them to HDF5 collection
                        hdf5_datasets = get_hdf5_datasets(log_data, data_folder)
                        log_data[IMAGE_STACKS].extend(hdf5_datasets)
                    # add report directory path
                    log_data[DIR_PATH] = remove_disk_directory(
                        os.path.dirname(report_file_path))
                    reports.append(log_data)
                else:
                    # if exists, check HDF5 collection for new datasets
                    log_data = _DB_CONNECTOR.find_one(RUN_REPORT_COLLECTION,
                                                      UTAG, utag)
                    # If previously a run report was not there or had the wrong
                    # format, the mongo document only has three or four fields:
                    # _id, datetime, unique_tag, and maybe dir_path. If this
                    # occurs, try reading the run report again.
                    if not set(log_data.keys()) - set([ID, DATETIME, UTAG,
                                                       DIR_PATH]):
                        log_data = read_report_file(report_file_path, date_obj,
                                                    utag)
                        if log_data is None or all(
                                not log_data[DEVICE_NAME].lower().startswith(x)
                                for x in ['pilot', 'beta']):
                            continue
                        # add report directory path
                        log_data[DIR_PATH] = remove_disk_directory(
                            os.path.dirname(report_file_path))
                        # add image stacks to archive collection
                        update_image_stacks(log_data, data_folder)

                    if IMAGE_STACKS in log_data:
                        # find HDF5 datasets and add new records to HDF5
                        # collection
                        new_datasets = set(get_hdf5_datasets(log_data,
                                                             data_folder))
                        if new_datasets:
                            # exclude uploaded HDF5 datasets
                            exist_datasets = set([
                                d for d in log_data[IMAGE_STACKS]
                                if isinstance(d, str) or isinstance(d, unicode)
                            ])
                            new_datasets = list(new_datasets - exist_datasets)
                            if new_datasets:
                                _DB_CONNECTOR.update(
                                    RUN_REPORT_COLLECTION,
                                    {UTAG: utag},
                                    {"$addToSet": {
                                        IMAGE_STACKS: {'$each': new_datasets}}})
                                APP_LOGGER.info(
                                    'Updated run report utag=%s with %d datasets'
                                    % (utag, len(new_datasets)))

        APP_LOGGER.info("Found %d run reports" % (len(reports)))
        if len(reports) > 0:
            # There is a possible race condition here. Ideally these operations
            # would be performed in concert atomically
            _DB_CONNECTOR.insert(RUN_REPORT_COLLECTION, reports)
    else:
        APP_LOGGER.error(
            "Couldn't locate run report path '%s', to update database."
            % RUN_REPORT_PATH)
        return False

    APP_LOGGER.info(
        "Database successfully updated with available run reports.")
    return True
def update_hdf5s():
    '''
    Update the database with available HDF5 files.

    @return True if database is successfully updated, False otherwise
    '''
    APP_LOGGER.info("Updating database with available HDF5 files...")
    # check if run report path exists
    if not os.path.isdir(RUN_REPORT_PATH):
        APP_LOGGER.error(
            "Couldn't locate run report path '%s', to update database."
            % RUN_REPORT_PATH)
        return False

    # Find new HDF5 files using nested listdirs, which is much faster here
    # than glob, os.walk, or scandir. Only search two subdirectory levels
    # within the run report folder; this assumes each HDF5 file lives in a
    # subfolder of a run report folder.
    database_paths = set(
        _DB_CONNECTOR.distinct_sorted(HDF5_COLLECTION, HDF5_PATH))
    current_paths = set()
    for par_ in os.listdir(RUN_REPORT_PATH):
        report_dir = os.path.join(RUN_REPORT_PATH, par_)
        if os.path.isdir(report_dir):
            for sub_ in os.listdir(report_dir):
                subdir = os.path.join(report_dir, sub_)
                if os.path.isdir(subdir):
                    hdf5s = [f for f in os.listdir(subdir)
                             if os.path.splitext(f)[-1] in VALID_HDF5_EXTENSIONS]
                    hdf5_paths = [os.path.join(subdir, f) for f in hdf5s]
                    current_paths.update(hdf5_paths)

    # Check yyyy_mm/dd/HHMM_pilotX location
    run_folders = get_run_folders()
    for folder in run_folders:
        hdf5s = [f for f in os.listdir(folder)
                 if os.path.splitext(f)[-1] in VALID_HDF5_EXTENSIONS]
        hdf5_paths = [os.path.join(folder, f) for f in hdf5s]
        current_paths.update(hdf5_paths)

    # update database with any new files
    new_hdf5_paths = current_paths - database_paths
    new_records = list()
    for hdf5_path in new_hdf5_paths:
        try:
            with h5py.File(hdf5_path) as h5_file:
                dataset_names = h5_file.keys()
                for dsname in dataset_names:
                    if any(re.match(pat, dsname) for pat in
                           [r'^\d{4}-\d{2}-\d{2}_\d{4}\.\d{2}',
                            r'^Pilot\d+_\d{4}-\d{2}-\d{2}_\d{4}\.\d{2}']):
                        new_records.append({
                            HDF5_PATH: remove_disk_directory(hdf5_path),
                            HDF5_DATASET: dsname,
                        })
        except:
            APP_LOGGER.exception(
                'Unable to get dataset information from HDF5 file: %s'
                % hdf5_path)

    if new_records:
        # There is a possible race condition here. Ideally these operations
        # would be performed in concert atomically
        _DB_CONNECTOR.insert(HDF5_COLLECTION, new_records)
        APP_LOGGER.info('Updated database with %d new HDF5 files'
                        % len(new_records))
    else:
        APP_LOGGER.info('Unable to find any new HDF5 files')

    return True