def process_genotyping_manifest_file_from_bucket(bucket_name, genotyping_folder_name):
    """Finds and processes the most recently created genotyping manifest CSV.

    Scans `bucket_name` for CSV files whose path contains
    `genotyping_folder_name`, picks the newest by creation time, and feeds it to
    update_sample_info_from_genotyping_manifest_file -- unless the file is older
    than _MAX_INPUT_AGE, in which case nothing is processed.

    Args:
        bucket_name: Cloud storage bucket name (without leading slash).
        genotyping_folder_name: Folder name substring the CSV path must contain.

    Returns:
        None in every case; processing happens as a side effect.
    """
    bucket_stat_list = cloudstorage_api.listbucket('/' + bucket_name)
    if not bucket_stat_list:
        logging.info('No files in cloud bucket %r.' % bucket_name)
        return None
    # GCS listings are recursive (directories are just a naming convention),
    # so filter down to CSVs that live under the genotyping folder.
    bucket_stat_list = [
        s for s in bucket_stat_list
        if s.filename.lower().endswith('.csv')
        and '%s' % genotyping_folder_name in s.filename]
    if not bucket_stat_list:
        logging.info('No CSVs in cloud bucket %r folder %r (all files: %s).' %
                     (bucket_name, genotyping_folder_name, bucket_stat_list))
        return None
    bucket_stat_list.sort(key=lambda s: s.st_ctime)
    path = bucket_stat_list[-1].filename
    timestamp = datetime.datetime.utcfromtimestamp(bucket_stat_list[-1].st_ctime)
    now = clock.CLOCK.now()
    # Reject stale input *before* opening the file: the original opened the
    # GCS handle first and leaked it when the age check failed.
    if now - timestamp > _MAX_INPUT_AGE:
        logging.info(
            'Input %r (timestamp %s UTC) is > 24h old (relative to %s UTC), not processing.'
            % (path, timestamp, now))
        return None
    csv_file = cloudstorage_api.open(path)
    logging.info('Opening latest genotyping manifest CSV in %r: %r.',
                 bucket_name + '/' + genotyping_folder_name, path)
    update_sample_info_from_genotyping_manifest_file(csv_file)
def process_genomic_manifest_result_file_from_bucket():
    """Finds and processes the most recently created genomic manifest result CSV.

    Reads the bucket and result-folder names from config, picks the newest CSV
    under that folder by creation time, and -- if the file's
    filename-embedded timestamp is no older than _MAX_INPUT_AGE -- updates
    package ids for the genomic set identified by the filename.

    Returns:
        None in every case; processing happens as a side effect.
    """
    bucket_name = config.getSetting(config.BIOBANK_SAMPLES_BUCKET_NAME)
    result_folder_name = config.getSetting(GENOMIC_BIOBANK_MANIFEST_RESULT_FOLDER_NAME)
    bucket_stat_list = cloudstorage_api.listbucket('/' + bucket_name)
    if not bucket_stat_list:
        logging.info('No files in cloud bucket %r.' % bucket_name)
        return None
    # GCS listings are recursive; keep only CSVs under the result folder.
    bucket_stat_list = [
        s for s in bucket_stat_list
        if s.filename.lower().endswith('.csv')
        and '%s' % result_folder_name in s.filename]
    if not bucket_stat_list:
        logging.info(
            'No CSVs in cloud bucket %r folder %r (all files: %s).' %
            (bucket_name, result_folder_name, bucket_stat_list))
        return None
    bucket_stat_list.sort(key=lambda s: s.st_ctime)
    path = bucket_stat_list[-1].filename
    filename = path.replace('/' + bucket_name + '/' + result_folder_name + '/', '')
    timestamp = timestamp_from_filename(filename)
    now = clock.CLOCK.now()
    # Reject stale input *before* opening the file: the original opened the
    # GCS handle first and leaked it when the age check failed. The duplicate
    # debug print() of this message was also removed; logging is the single
    # reporting channel.
    if now - timestamp > _MAX_INPUT_AGE:
        logging.info('Input %r (timestamp %s UTC) is > 24h old (relative to %s UTC), not processing.'
                     % (filename, timestamp, now))
        return None
    csv_file = cloudstorage_api.open(path)
    logging.info('Opening latest genomic manifest result CSV in %r: %r.',
                 bucket_name + '/' + result_folder_name, path)
    genomic_set_id = _get_genomic_set_id_from_filename(filename)
    update_package_id_from_manifest_result_file(genomic_set_id, csv_file)
def _find_latest_genomic_set_csv(self, cloud_bucket_name, keyword=None):
    """Returns the filename of the newest CSV in the bucket.

    When `keyword` is given, only CSVs whose filename contains it are
    considered.

    Raises:
        RuntimeError: if the bucket is empty, holds no CSVs, or (when a
            keyword is given) no CSV filename contains the keyword.
    """
    stats = cloudstorage_api.listbucket('/' + cloud_bucket_name)
    if not stats:
        raise RuntimeError('No files in cloud bucket %r.' % cloud_bucket_name)
    csv_stats = [s for s in stats if s.filename.lower().endswith('.csv')]
    if not csv_stats:
        raise RuntimeError('No CSVs in cloud bucket %r (all files: %s).' %
                           (cloud_bucket_name, csv_stats))
    if keyword:
        matching = [s for s in csv_stats if keyword in s.filename]
        if not matching:
            raise RuntimeError(
                'No CSVs in cloud bucket %r with keyword %s (all files: %s).' %
                (cloud_bucket_name, keyword, csv_stats))
        # sort + [-1] (not max) keeps the original tie-breaking for equal
        # creation times: stable sort puts the last-listed newest entry last.
        matching.sort(key=lambda s: s.st_ctime)
        return matching[-1].filename
    csv_stats.sort(key=lambda s: s.st_ctime)
    return csv_stats[-1].filename
def update_ehr_status():
    """Entrypoint, executed as a cron job.

    Lists the curation bucket and defers one update task per organization
    whose files are newer than the one-day cutoff. A missing-config error is
    logged and swallowed so the cron run exits cleanly.
    """
    now = clock.CLOCK.now()
    cutoff_date = (now - datetime.timedelta(days=1)).date()
    bucket_name = _get_curation_bucket_name()
    try:
        listing = cloudstorage_api.listbucket('/' + bucket_name)
        org_infos = _get_organization_info_list(listing, cutoff_date)
    except config.MissingConfigException as e:
        LOG.info(str(e))
        return
    for org_info in org_infos:
        deferred.defer(_do_update_for_organization, *org_info)
def _find_latest_genomic_set_csv(cloud_bucket_name):
    """Returns the full path (including bucket name) of the most recently created
    CSV in the bucket, excluding result files.

    Args:
        cloud_bucket_name: Name of the cloud storage bucket to scan.

    Raises:
        FileNotFoundError: if the bucket is empty or contains no non-result
            CSVs. (The docstring previously claimed RuntimeError, which did
            not match the code; callers must catch FileNotFoundError.)
    """
    bucket_stat_list = cloudstorage_api.listbucket('/' + cloud_bucket_name)
    if not bucket_stat_list:
        raise FileNotFoundError('No files in cloud bucket %r.' % cloud_bucket_name)
    # GCS does not really have the concept of directories (it's just a filename
    # convention), so all directory listings are recursive and we must filter
    # out subdirectory contents. Result files are excluded by suffix.
    bucket_stat_list = [
        s for s in bucket_stat_list
        if s.filename.lower().endswith('.csv')
        and '%s' % _RESULT_FILE_SUFFIX not in s.filename]
    if not bucket_stat_list:
        raise FileNotFoundError(
            'No CSVs in cloud bucket %r (all files: %s).' %
            (cloud_bucket_name, bucket_stat_list))
    bucket_stat_list.sort(key=lambda s: s.st_ctime)
    return bucket_stat_list[-1].filename