def get_existing_items_from_db(connection, itype, include_invisible=True):
    """Yield every existing item of type ``itype`` found in the database.

    A plain type search is always run.  When ``include_invisible`` is true a
    second search restricted to the 'deleted' and 'replaced' statuses is run
    too, and its hits are yielded after those of the first search.

    :param connection: handed to ``search_metadata`` as its second argument
    :param itype: item type name placed in the ``type=`` query parameter
    :param include_invisible: pass False to skip the deleted/replaced search
    :return: generator over the items produced by ``search_metadata``
    """
    base_query = 'search/?type={}'.format(itype)
    queries = [base_query]
    if include_invisible:
        status_filter = ''.join(
            '&status={}'.format(status) for status in ('deleted', 'replaced'))
        queries.append(base_query + status_filter)

    # Start every search before yielding anything, then drain them in order.
    result_streams = [
        search_metadata(query, connection, page_limit=200, is_generator=True)
        for query in queries
    ]
    for stream in result_streams:
        yield from stream
def exp_has_raw_files(connection, **kwargs):
    '''
    Check for sequencing experiments that don't have raw files.

    Two groups of experiments are collected:
      * non-microscopy experiments whose ``files`` field is empty, and
      * experiments linked from fastq files in a "bad" status (uploading,
        archived, deleted, upload failed) that have no file that is either
        outside that bad-status set or in the same status as the experiment.

    Experiments whose status is in REV are only reported in brief_output;
    the others are compared against the current holders of the
    'no-raw-files' badge. The action associated with this check patches badges.

    Returns the populated CheckResult.
    '''
    check = CheckResult(connection, 'exp_has_raw_files')
    # search all experiments except microscopy experiments for missing files field
    no_files = ff_utils.search_metadata('search/?type=Experiment&%40type%21=ExperimentMic&files.uuid=No+value',
                                        key=connection.ff_keys)
    # also check sequencing experiments whose files items are all uploading/archived/deleted
    bad_status = ff_utils.search_metadata('search/?status=uploading&status=archived&status=deleted&status=upload+failed'
                                          '&type=FileFastq&experiments.uuid%21=No+value',
                                          key=connection.ff_keys)
    bad_status_ids = {item['@id']: item['status'] for item in bad_status}
    # `or []` is what actually protects against a missing/None 'experiments'
    # value: a trailing `if` in the comprehension would only be evaluated
    # after the value had already been iterated.
    exps = list({
        exp['@id']
        for fastq in bad_status
        for exp in (fastq.get('experiments') or [])
    })
    missing_files_released = [e['@id'] for e in no_files if e.get('status') not in REV]
    missing_files_in_rev = [e['@id'] for e in no_files if e.get('status') in REV]
    for expt in exps:
        result = ff_utils.get_metadata(expt, key=connection.ff_keys)
        # a file counts as a usable raw file when it is not in the bad-status
        # set, or when its bad status equals the experiment's own status
        raw_files = any(
            fastq['@id'] not in bad_status_ids or result['status'] == bad_status_ids[fastq['@id']]
            for fastq in (result.get('files') or [])
        )
        if not raw_files:
            if result.get('status') in REV:
                missing_files_in_rev.append(expt)
            else:
                missing_files_released.append(expt)

    to_add, to_remove, ok = compare_badges(missing_files_released, 'Experiment', 'no-raw-files', connection.ff_keys)

    if to_add or to_remove:
        check.status = 'WARN'
        check.summary = 'Raw Files badges need patching'
        check.description = '{} sequencing experiments need raw files badges patched'.format(
            len(to_add) + len(to_remove)
        )
        check.allow_action = True
    else:
        check.status = 'PASS'
        check.summary = 'Raw Files badges up-to-date'
        check.description = 'No sequencing experiments need raw files badges patched'
    check.action = 'patch_badges_for_raw_files'
    check.full_output = {'Add badge': to_add,
                         'Remove badge': to_remove,
                         'Keep badge': ok}
    check.brief_output = {REV_KEY: missing_files_in_rev,
                          RELEASED_KEY: {'Add badge': to_add, 'Remove badge': to_remove}}
    return check
def check_validation_errors(connection, **kwargs):
    '''
    Report how many items carry schema validation errors.

    Searches for items of any type whose validation_errors.name has a value.
    When hits exist the check is set to WARN, the description lists the
    number of hits and the item types involved (every '@type' entry except
    'Item'), and ff_link points at the same search on the server.
    Otherwise the check is set to PASS.
    '''
    check = CheckResult(connection, 'check_validation_errors')

    search_url = 'search/?validation_errors.name!=No+value&type=Item'
    results = ff_utils.search_metadata(search_url + '&field=@id', key=connection.ff_keys)

    if not results:
        check.status = 'PASS'
        check.summary = 'No validation errors'
        check.description = 'No validation errors found.'
        return check

    types = {
        item for result in results for item in result['@type'] if item != 'Item'
    }
    message = ('{} items found with validation errors, comprising the following '
               'item types: {}. \nFor search results see link below.')
    check.status = 'WARN'
    check.summary = 'Validation errors found'
    check.description = message.format(len(results), ', '.join(types))
    check.ff_link = connection.ff_server + search_url
    return check
def page_children_routes(connection, **kwargs):
    '''
    Check that every child page's route is a direct sub-route of its parent.

    Looks at all Page items that have children (the page named
    'resources/data-collections' is exempt). A child is flagged when its name
    is not exactly '<parent name>/<last path segment of the child name>'.
    full_output maps each offending parent name to its flagged child names.
    '''
    check = CheckResult(connection, 'page_children_routes')

    page_search = 'search/?type=Page&format=json&children.name%21=No+value'
    results = ff_utils.search_metadata(page_search, key=connection.ff_keys)

    problem_routes = {}
    for page in results:
        parent_name = page['name']
        if parent_name == 'resources/data-collections':
            continue
        bad_children = []
        for child in page['children']:
            expected_route = parent_name + '/' + child['name'].rsplit('/', 1)[-1]
            if child['name'] != expected_route:
                bad_children.append(child['name'])
        if bad_children:
            problem_routes[parent_name] = bad_children

    if problem_routes:
        bad_count = sum(len(children) for children in problem_routes.values())
        check.status = 'WARN'
        check.summary = 'Pages with bad routes found'
        check.description = (
            '{} child pages whose route is not a direct sub-route of parent'
            ''.format(bad_count))
    else:
        check.status = 'PASS'
        check.summary = 'No pages with bad routes'
        check.description = 'All routes of child pages are a direct sub-route of parent page'
    check.full_output = problem_routes
    return check
def compare_badges_and_messages(obj_id_dict, item_type, badge, ff_keys):
    '''
    Compares items that should have a given badge to items that do have the
    given badge. Also compares badge messages to see if the message is the
    right one or needs to be updated.

    Args:
        obj_id_dict: dictionary mapping an item's @id to the badge messages
            it should have.
        item_type: item type used in the ``type=`` search parameter.
        badge: badge name; items are searched by ``/badges/<badge>/``.
        ff_keys: passed as ``key`` to ``ff_utils.search_metadata``.

    Returns:
        Tuple ``(needs_badge, remove_badge, badge_edit, badge_ok)``:
          * needs_badge  - {@id: messages} for items in obj_id_dict that are
            neither up to date nor queued for a message edit
          * remove_badge - {@id: badges list without this badge} for items
            that have the badge but are not in obj_id_dict
          * badge_edit   - {@id: badges list with updated messages}
          * badge_ok     - list of @ids whose badge messages already match
    '''
    search_url = 'search/?type={}&badges.badge.@id=/badges/{}/'.format(item_type, badge)
    has_badge = ff_utils.search_metadata(search_url + '&frame=object', key=ff_keys)
    badge_edit = {}
    badge_ok = []
    remove_badge = {}
    for item in has_badge:
        if item['@id'] in obj_id_dict:
            # handle differences in badge messages
            for a_badge in item['badges']:
                if a_badge['badge'].endswith(badge + '/'):
                    if a_badge.get('messages') == obj_id_dict[item['@id']]:
                        badge_ok.append(item['@id'])
                    else:
                        # replace any single 'message' with the expected 'messages'
                        if a_badge.get('message'):
                            del a_badge['message']
                        a_badge['messages'] = obj_id_dict[item['@id']]
                        badge_edit[item['@id']] = item['badges']
                    break
        else:
            # item should no longer carry the badge: patch body is its
            # badges list minus the first entry matching this badge
            this_badge = [a_badge for a_badge in item['badges'] if badge in a_badge['badge']][0]
            item['badges'].remove(this_badge)
            remove_badge[item['@id']] = item['badges']
    # Build the "already handled" lookup once instead of concatenating and
    # scanning two lists for every key.
    already_handled = set(badge_ok) | set(badge_edit)
    needs_badge = {
        key: val for key, val in obj_id_dict.items() if key not in already_handled
    }
    return needs_badge, remove_badge, badge_edit, badge_ok
def __init__(self, ff_keys=None, ffe_all=None):
    """Build ``self.fe_dict``, a lookup of file formats to their extensions.

    Exactly one of the two arguments must be given:

    :param ff_keys: key dict (its 'server' entry is logged); when given, all
        FileFormat objects are fetched with ``search_metadata``
    :param ffe_all: an already fetched list of FileFormat objects to use
        instead of querying the server
    :raises Exception: if neither or both arguments are given, or if the
        FileFormat search fails

    Each entry of ``self.fe_dict`` is keyed by 'file_format' and holds
    'standard_extension', 'other_allowed_extensions' and 'extrafile_formats'
    (the last two default to empty lists).
    """
    # NOTE: the two literals of each message need an explicit separating
    # space, otherwise they render as "specifiedto create".
    if not ff_keys and not ffe_all:
        raise Exception("Either ff_keys or ffe_all must be specified "
                        "to create a FormatExtensionMap object")
    if ff_keys and ffe_all:
        raise Exception("Either ff_keys or ffe_all must be specified but not both "
                        "to create a FormatExtensionMap object")
    if ff_keys:  # ffe_all is known to be empty here
        try:
            logger.debug("Searching in server : " + ff_keys['server'])
            ffe_all = search_metadata(
                "/search/?type=FileFormat&frame=object", key=ff_keys)
        except Exception as e:
            raise Exception(
                "Can't get the list of FileFormat objects. %s\n" % e)
    self.fe_dict = dict()
    logger.debug("**ffe_all = " + str(ffe_all))
    for k in ffe_all:
        file_format = k['file_format']
        self.fe_dict[file_format] = {
            'standard_extension': k['standard_file_extension'],
            'other_allowed_extensions': k.get('other_allowed_extensions', []),
            'extrafile_formats': k.get('extrafile_formats', []),
        }
def compare_existing_to_newly_generated(logger, connection, evidence_items, itype):
    """Reconcile newly generated items with those already in the database.

    Every non-obsolete database item of type ``itype`` is converted with
    ``convert2raw``.  If the converted form is present in ``evidence_items``
    it is removed from that list (in place) and counted as existing;
    otherwise the database item's uuid is collected for obsoletion.

    :return: tuple ``(evidence_items, existing, uids2obsolete)`` - the pruned
        input list, the number of matches, and the uuids with no match
    """
    def _utc_stamp():
        return datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")

    query = 'search/?type={}&status!=obsolete'.format(itype)
    logger.info("COMPARING FILE ITEMS WITH CURRENT DB CONTENT")
    logger.info("searching: {}".format(_utc_stamp()))
    db_items = search_metadata(query, connection, is_generator=True, page_limit=500)

    matched = 0
    unmatched_uuids = []
    logger.info("comparing: {}".format(_utc_stamp()))
    for db_item in db_items:
        raw_form = convert2raw(db_item)
        if raw_form not in evidence_items:
            unmatched_uuids.append(db_item.get('uuid'))
            continue
        matched += 1
        evidence_items.remove(raw_form)
    logger.info("result: {}".format(_utc_stamp()))
    return evidence_items, matched, unmatched_uuids
def safe_search_with_callback(fdn_conn, query, container, callback, limit=20, frame='embedded'):
    """
    Run a search one page at a time and feed each hit to ``callback``.

    ``query`` must not contain 'limit' or 'from' parameters; they are
    appended here for every page.  ``callback`` is invoked as
    ``callback(hit, container)`` for each hit.  Paging stops when a page comes
    back empty or holds a number of hits different from ``limit``.  The
    current offset is printed before each request.
    """
    offset = 0
    while True:
        print('...', offset)
        page_query = query + '&from=' + str(offset) + '&limit=' + str(limit)
        hits = search_metadata(page_query, connection=fdn_conn, frame=frame)
        if not hits:  # 0 results
            return
        page_size = len(hits)
        offset += page_size
        for hit in hits:
            callback(hit, container)
        # a short page means there is nothing left to fetch
        if page_size and page_size != limit:
            return
def experiment_set_reporting_data(connection, **kwargs):
    """
    Build a snapshot report of all replicate experiment sets.

    Two searches are run (page_limit=20 each): the default one, and one
    restricted to status=deleted / status=replaced.  Every hit from both is
    passed to ``add_to_report`` together with the dict that becomes the
    check's full_output.  The check status is always 'IGNORE'.
    """
    check = CheckResult(connection, 'experiment_set_reporting_data')
    check.status = 'IGNORE'

    report = {}
    base_query = '/search/?type=ExperimentSetReplicate&experimentset_type=replicate&sort=-date_created'
    visible_hits = ff_utils.search_metadata(base_query, key=connection.ff_keys, page_limit=20)
    # deleted and replaced sets need their own search
    hidden_hits = ff_utils.search_metadata(base_query + '&status=deleted&status=replaced',
                                           key=connection.ff_keys, page_limit=20)
    visible_hits.extend(hidden_hits)
    for exp_set in visible_hits:
        add_to_report(exp_set, report)

    check.full_output = report
    return check
def get_existing_ontology_terms(connection, ontologies=None):
    '''Return all OntologyTerm items from the db as a dict keyed by term_id.

    When ``ontologies`` is given, the search is restricted by adding one
    ``source_ontology.uuid`` filter per ontology (using its 'uuid' value).
    '''
    ontology_filter = ''
    if ontologies is not None:
        ontology_filter = ''.join(
            '&source_ontology.uuid={}'.format(ontology.get('uuid'))
            for ontology in ontologies)
    query = 'search/?type=OntologyTerm' + ontology_filter
    terms = search_metadata(query, connection, page_limit=200, is_generator=True)

    terms_by_id = {}
    for term in terms:
        terms_by_id[term['term_id']] = term
    return terms_by_id
def get_item_ids_from_args(id_input, auth, is_search=False):
    '''Resolve command-line id arguments into a list of item ids.

    * is_search=True: the first element of ``id_input`` is a query string;
      the 'uuid' of every search hit is returned.
    * otherwise the first element is tried as a file path and the stripped
      lines of that file are returned; if no such file exists ``id_input``
      itself is returned unchanged.
    '''
    if not is_search:
        try:
            with open(id_input[0]) as id_file:
                return [line.strip() for line in id_file]  # pragma: no cover
        except FileNotFoundError:
            return id_input

    hits = search_metadata('search/?' + id_input[0], auth, is_generator=True)
    return [hit.get('uuid') for hit in hits]
def paired_end_info_consistent(connection, **kwargs):
    '''
    Check that fastq files carry consistent paired-end information.

    Two searches are made: fastqs with a "paired with" related file but no
    paired_end value, and fastqs with a paired_end value but no "paired with"
    relationship.  The check WARNs if either search returns anything;
    full_output groups the @ids by problem and brief_output is the flat list.
    '''
    missing_number_key = 'paired with file missing paired_end number'
    missing_partner_key = 'file with paired_end number missing "paired with" related_file'

    check = CheckResult(connection, 'paired_end_info_consistent')

    search1 = 'search/?type=FileFastq&file_format.file_format=fastq&related_files.relationship_type=paired+with&paired_end=No+value'
    search2 = 'search/?type=FileFastq&file_format.file_format=fastq&related_files.relationship_type!=paired+with&paired_end%21=No+value'
    results1 = ff_utils.search_metadata(search1 + '&frame=object', key=connection.ff_keys)
    results2 = ff_utils.search_metadata(search2 + '&frame=object', key=connection.ff_keys)

    missing_number = [hit['@id'] for hit in results1]
    missing_partner = [hit['@id'] for hit in results2]
    results = {
        missing_number_key: missing_number,
        missing_partner_key: missing_partner,
    }

    if missing_number or missing_partner:
        check.status = 'WARN'
        check.summary = 'Inconsistencies found in FileFastq paired end info'
        check.description = (
            '{} files found with a "paired with" related_file but missing a paired_end number; '
            '{} files found with a paired_end number but missing related_file info'
            ''.format(len(missing_number), len(missing_partner)))
    else:
        check.status = 'PASS'
        check.summary = 'No inconsistencies in FileFastq paired end info'
        check.description = 'All paired end fastq files have both paired end number and "paired with" related_file'
    check.full_output = results
    check.brief_output = missing_number + missing_partner
    return check
def search_result(params):
    '''
    Run a portal search built from ``params`` and return the result.

    The access key is read from ``~/keypairs.json`` under the module-level
    KEYNAME entry.  ``params`` is url-encoded into the query string, e.g.
    {"type": "Gene"} becomes "/search/?type=Gene".
    '''
    with open(path.expanduser("~") + '/keypairs.json') as keyfile:
        key = json.load(keyfile)[KEYNAME]
    query = "%s?%s" % ("/search/", urlencode(params))
    return ff_utils.search_metadata(query, key=key)
def batch_fastqc(env, batch_size=20):
    '''
    Run fastqc on uploaded files that do not have a fastqc run yet.

    At most ``batch_size`` files from a status=uploaded File search are
    looked at.  A file is skipped when any of its workflow_run_inputs
    contains 'fastqc'.  The loop pauses 5 seconds after every file and a
    further 60 seconds whenever the processed count is a multiple of 10.
    On Ctrl-C the running totals are printed and the process exits with -1.
    '''
    files_processed = 0
    files_skipped = 0

    # handle ctrl-c
    import signal

    def _report_and_exit(signum, frame):
        print("Processed %s files, skipped %s files" % (files_processed, files_skipped))
        sys.exit(-1)

    signal.signal(signal.SIGINT, _report_and_exit)

    tibanna = Tibanna(env=env)
    uploaded_files = search_metadata(
        "search/?type=File&status=uploaded&limit=%s" % batch_size,
        key=tibanna.ff_key, ff_env=tibanna.env)

    # TODO: need to change submit 4dn to not overwrite my limit
    candidates = uploaded_files['@graph']
    if len(candidates) > batch_size:
        candidates = candidates[:batch_size]

    for ufile in candidates:
        accession = ufile.get('accession')
        already_run = any(
            'fastqc' in wfrun for wfrun in ufile.get('workflow_run_inputs', []))
        if already_run:
            print("******** fastqc already run for %s skipping" % accession)
            files_skipped += 1
        else:
            print("running fastqc for %s" % accession)
            run_fastqc(env, accession, ufile.get('uuid'))
            files_processed += 1
        sleep(5)
        if files_processed % 10 == 0:
            sleep(60)

    print("Processed %s files, skipped %s files" % (files_processed, files_skipped))
def __init__(self, ff_keys):
    """Fetch all FileFormat objects and index their extensions by format.

    ``self.fe_dict`` maps each 'file_format' to a dict holding its
    'standard_extension', 'other_allowed_extensions' and 'extrafile_formats'
    (the last two default to empty lists).

    :param ff_keys: key dict; its 'server' entry is logged and the whole
        dict is passed as ``key`` to ``search_metadata``
    :raises Exception: if the FileFormat objects cannot be retrieved
    """
    try:
        printlog("Searching in server : " + ff_keys['server'])
        file_formats = search_metadata("/search/?type=FileFormat&frame=object", key=ff_keys)
    except Exception as e:
        raise Exception("Can't get the list of FileFormat objects. %s\n" % e)

    self.fe_dict = dict()
    printlog("**ffe_all = " + str(file_formats))
    for fmt in file_formats:
        extensions = {
            'standard_extension': fmt['standard_file_extension'],
            'other_allowed_extensions': fmt.get('other_allowed_extensions', []),
            'extrafile_formats': fmt.get('extrafile_formats', []),
        }
        self.fe_dict[fmt['file_format']] = extensions
def get_ontologies(connection, ont_list):
    '''Return the list of ontology jsons retrieved from the server.

    :param connection: passed through to ``search_metadata``/``get_metadata``
    :param ont_list: the string 'all' to search for every Ontology, or an
        iterable of ontology identifiers, each fetched from
        'ontologys/<identifier>'
    :return: list of the retrieved items whose '@type' contains 'Ontology';
        anything else (e.g. a not-found response) is dropped

    Exits the process if the retrieval did not produce a list or tuple.
    '''
    if ont_list == 'all':
        ontologies = search_metadata('search/?type=Ontology', connection)
    else:
        ontologies = [get_metadata('ontologys/' + ontology, connection) for ontology in ont_list]

    if not isinstance(ontologies, (list, tuple)):
        print("we must not have got ontolgies... bailing")
        import sys
        sys.exit()

    # Filter into a new list: popping from the list while enumerating it
    # skips the element that follows every removed one.
    return [
        ontology for ontology in ontologies
        if 'Ontology' in ontology.get('@type', [])
    ]
def __init__(self, connection, schema_name):
    """Load the schema profile for ``schema_name`` from the server.

    Sets ``self.required`` to the profile's 'required' entry (None when the
    profile has none) and ``self.properties`` to its 'properties'.  For
    schemas listed in ``file_types`` that define a 'file_format' property,
    that property's 'enum' is filled with the file formats valid for the
    schema, as returned by a FileFormat search.
    """
    profile = ff_utils.get_metadata('/profiles/' + schema_name + '.json',
                                    key=connection.key,
                                    add_on="frame=object")
    self.required = profile['required'] if 'required' in profile else None

    properties = profile['properties']
    if schema_name in file_types and properties.get('file_format'):
        format_query = '/search/?type=FileFormat&field=file_format&valid_item_types={}'.format(
            schema_name)
        valid_formats = []
        for file_format in ff_utils.search_metadata(format_query, key=connection.key):
            valid_formats.append(file_format['file_format'])
        profile['properties']['file_format']['enum'] = valid_formats
    self.properties = profile['properties']
def get_slim_terms(connection):
    '''Return the ontology_term jsons that are slim terms.

    One search per slim category is made on the 'is_slim_for' field and the
    hits are concatenated in category order.  A TypeError raised while
    fetching or collecting a category is printed and that category skipped.
    '''
    # The categories are hard coded for now; once it is possible to search
    # for all of them this could take the categories as a parameter.
    categories = ('developmental', 'assay', 'organ', 'system', 'cell')
    collected = []
    for category in categories:
        query = 'search/?type=OntologyTerm&is_slim_for=' + category
        try:
            collected.extend(search_metadata(query, connection))
        except TypeError as err:
            print(err)
    return collected
def compare_badges(obj_ids, item_type, badge, ff_keys):
    '''
    Work out which items need a badge added, removed or left alone.

    Intended for badges with a single message choice.  ``obj_ids`` is the
    list of item @ids that should have the badge; it is compared with the
    items of ``item_type`` that currently have it.

    Returns a tuple (needs_badge, remove_badge, badge_ok):
      needs_badge  - @ids from obj_ids that do not have the badge yet
      remove_badge - {@id: badges without this badge} for items that have the
                     badge but are not in obj_ids
      badge_ok     - @ids that have the badge and should keep it
    '''
    search_url = 'search/?type={}&badges.badge.@id=/badges/{}/'.format(item_type, badge)
    badged_items = ff_utils.search_metadata(search_url + '&frame=object', key=ff_keys)

    badge_ok = []
    remove_badge = {}
    for item in badged_items:
        item_id = item['@id']
        if item_id in obj_ids:
            badge_ok.append(item_id)
            continue
        # patch body: every badge on the item except the one being compared
        remove_badge[item_id] = [
            entry for entry in item['badges'] if badge not in entry['badge']
        ]

    needs_badge = [obj_id for obj_id in obj_ids if obj_id not in badge_ok]
    return needs_badge, remove_badge, badge_ok
def gold_biosamples(connection, **kwargs):
    '''
    Find the biosamples that qualify for the gold badge.

    Gold level commendation criteria:
        1. Tier 1 or Tier 2 Cells obtained from the approved 4DN source and grown
           precisely according to the approved SOP including any additional
           authentication (eg. HAP-1 haploid line requires ploidy authentication).
        2. All required metadata present (does not have a biosample warning badge).

    As implemented: Tier 1/Tier 2 biosamples without a warning badge are
    searched; one is gold when every cell_culture_details entry has
    follows_sop == 'Yes' and its status is not in REV.  The result is
    compared with the current 'gold-biosample' badge holders.  The patch
    action is only allowed when the latest 'yellow_flag_biosamples' result
    has status PASS.
    '''
    check = CheckResult(connection, 'gold_biosamples')

    search_url = ('search/?biosource.cell_line_tier=Tier+1&biosource.cell_line_tier=Tier+2'
                  '&type=Biosample&badges.badge.warning=No+value')
    biosamples = ff_utils.search_metadata(search_url, key=connection.ff_keys)

    gold = []
    for biosample in biosamples:
        # follows SOP w/ no deviations
        follows_sop = all(
            details.get('follows_sop', '') == 'Yes'
            for details in biosample.get('cell_culture_details', []))
        if follows_sop and biosample.get('status') not in REV:
            gold.append(biosample['@id'])

    to_add, to_remove, ok = compare_badges(gold, 'Biosample', 'gold-biosample', connection.ff_keys)
    check.action = 'patch_gold_biosample_badges'

    if not (to_add or to_remove):
        check.status = 'PASS'
        check.summary = 'Gold biosample badges up-to-date'
        check.description = 'No gold biosample badges need patching'
    else:
        check.status = 'WARN'
        check.summary = 'Gold biosample badges need patching'
        check.description = '{} biosamples need gold badges patched. '.format(
            len(to_add) + len(to_remove.keys()))
        check.description += 'Yellow_flag_biosamples check must pass before patching.'
        latest_yellow = CheckResult(connection, 'yellow_flag_biosamples').get_latest_result()
        if latest_yellow['status'] == 'PASS':
            check.allow_action = True

    check.full_output = {'Add badge': to_add,
                         'Remove badge': to_remove,
                         'Keep badge (no change)': ok}
    return check
def get_latest_tracking_item_date(self, increment="daily"):
    """
    Return the ``for_date`` of the newest google_analytics TrackingItem.

    Searches TrackingItems of tracking_type google_analytics for the given
    date increment, sorted by descending ``google_analytics.for_date`` and
    limited to one hit.

    :param increment: 'daily' or 'monthly'
    :return: ``date`` built from the hit's 'YYYY-MM-DD' for_date, or None
        when the search finds nothing
    :raises IndexError: if ``increment`` is not one of the accepted values

    TODO: Accept yearly once we want to collect & viz it.
    """
    allowed_increments = ('daily', 'monthly')
    if increment not in allowed_increments:
        raise IndexError("increment parameter must be one of 'daily', 'monthly'")

    query = ('/search/?type=TrackingItem&tracking_type=google_analytics'
             '&sort=-google_analytics.for_date&limit=1&google_analytics.date_increment=' + increment)
    auth_key = dict(self.owner.access_key, server=self.owner.server)
    search_results = ff_utils.search_metadata(query, key=auth_key, page_limit=1)
    if len(search_results) == 0:
        return None

    latest_item = search_results[0]
    # TODO: Use date.fromisoformat() once we're on Python 3.7
    year, month, day = latest_item['google_analytics']['for_date'].split('-', 2)
    # Python months run 1..12 (unlike JS), so the parts can be used as they are.
    return date(int(year), int(month), int(day))
def main():
    """Build the Hi-C data-highlights tables and post/patch them as static sections.

    Steps, in order:
      1. Authenticate with the key named on the command line (exit code 1
         on failure).
      2. Search Publications and released ExperimentSetReplicates for the
         Hi-C experiment types listed below.
      3. Load the dataset groups from ``files/dsg.json`` (located one
         directory above this script's directory); exit if it is missing.
      4. Aggregate the experiment sets into one table row per dataset group.
      5. If experiment sets with unknown dataset labels were seen, ask
         interactively whether to ignore them or exit.
      6. For every study group, render the table (markdown or html depending
         on ``args.format``) and patch the matching static section; when the
         section does not exist, ask interactively whether to post or skip.
      7. Print a summary of patched / posted / skipped sections.

    With ``args.dryrun`` set nothing is posted or patched.
    """
    # getting authentication keys
    args = get_args()
    try:
        auth = ff_utils.get_authentication_with_server(args.key)
    except Exception as e:
        print("Authentication failed", e)
        sys.exit(1)

    dryrun = args.dryrun
    if dryrun:
        print("\nThis is a dry run\n")

    # collecting publication and expset search results
    hic_types = [
        'in+situ+Hi-C', 'Dilution+Hi-C', 'DNase+Hi-C', 'Micro-C', 'TCC'
    ]
    query_pub = '/search/?type=Publication'
    query_exp = '/search/?type=ExperimentSetReplicate&status=released'
    # one display_title filter per Hi-C experiment type is added to each query
    # NOTE(review): the loop variable shadows the builtin `type` within main()
    for type in hic_types:
        query_pub += '&exp_sets_prod_in_pub.experiments_in_set.experiment_type.display_title=' + type
        query_exp += '&experiments_in_set.experiment_type.display_title=' + type
    pubs_search = ff_utils.search_metadata(query_pub, key=auth)
    expsets_search = ff_utils.search_metadata(query_exp, key=auth)

    # building publications dictionary
    pubs_dict = convert_pubs_list_to_lookup(pubs_search)

    # loading dataset groups from json file
    repo_path = Path(__file__).resolve().parents[1]
    dsg_filename = repo_path.joinpath('files', 'dsg.json')
    if dsg_filename.exists():
        with open(dsg_filename) as dsg_fn:
            dsgs = json.load(dsg_fn)
    else:
        sys.exit("ERROR: Dataset grouping file not found")

    # making dataset list and mapping to dataset group
    dataset_list = []
    datasets_of_dsg = {}
    for k, v in dsgs.items():
        if v.get("datasets"):
            dataset_list.extend(v["datasets"])
            datasets_of_dsg[k] = v["datasets"]
        else:
            # if a dsg does not have datasets, then the dsg itself is the dataset
            dataset_list.append(k)

    # building the output table
    table = {}
    new_datasets = set()
    study_groups = set()

    for expset in expsets_search:
        dataset = expset.get("dataset_label")
        if dataset not in dataset_list:
            # unknown dataset label: remember it for the prompt further down
            new_datasets.add(dataset)
            continue

        # default: the dataset is its own group and the link filters on it alone
        dsg = dataset
        dsg_link = "dataset_label=" + dataset
        for group, elements in datasets_of_dsg.items():
            if dataset in elements:
                # dataset belongs to a group: link to all datasets of the group
                dsg_link = ("dataset_label=" + "&dataset_label=".join(elements))
                dsg = group
                break
        # escape '+' and '/' and turn spaces into '+' for the browse URL
        dsg_link = "/browse/?" + dsg_link.replace("+", "%2B").replace(
            "/", "%2F").replace(" ", "+")

        study_groups.add(dsgs[dsg].get("study_group"))

        # extend the row already collected for this group (or start a new one)
        row = table.get(dsg, {})
        table[dsg] = assemble_data_for_the_row(row, expset, dsg, dsg_link, pubs_dict, dsgs[dsg])

    # summarize number of experiment sets of each experiment type in a string
    for dsg, row in table.items():
        exp_type_summary = ""
        for exp_type, count in row["Replicate Sets"].items():
            if count > 0:
                exp_type_summary += str(count) + " " + exp_type + "<br>"
        if len(exp_type_summary) > 0:
            row['Replicate Sets'] = exp_type_summary[:-4]  # remove the trailing "<br>"
        else:
            row['Replicate Sets'] = ""

    # if new datasets are not in the json, ask what to do
    if new_datasets:
        print("New datasets found (not present in the json file):")
        for ds in new_datasets:
            print(ds)
        print("(i)gnore datasets or (e)xit to manually add them? [i/e]")
        response = None
        # keep reading input until one of the two accepted answers is given
        while response not in ['i', 'e']:
            response = input()
        if response == 'e':
            sys.exit("Add new dataset to dsg.json before generating table")

    # patch the static section for each study group
    skipped = []
    posted = []
    patched = []
    for studygroup in list(study_groups):

        # prepare static section
        # keep only the rows whose "Class" equals this study group,
        # in the order of the dataset groups in the json file
        table_dsg = {}
        for dsg in dsgs:
            if table.get(dsg):
                if table[dsg].get("Class") != studygroup:
                    continue
                else:
                    table_dsg[dsg] = table.get(dsg)

        keys = [
            'Data Set', 'Project', 'Replicate Sets', 'Species', 'Biosources',
            'Publication', 'Study', 'Lab'
        ]
        # the 'Study' column is left out for this particular study group
        if studygroup == "Single Time Point and Condition":
            keys.remove('Study')

        name = alias = output = filetype = None
        if args.format == 'markdown':
            name = "data-highlights.hic." + studygroup + ".md"
            name = name.lower().replace(" ", "-")
            alias = "4dn-dcic-lab:" + name
            filetype = 'jsx'
            default_col_widths = "[-1,100,-1,100,-1,-1,-1,-1]"
            # one entry fewer when the 'Study' column was removed above
            if "Study" not in keys:
                default_col_widths = "[-1,100,-1,120,250,-1,170]"
            output = md_table_maker(table_dsg, keys, name, default_col_widths)
        else:
            name = "data-highlights.hic." + studygroup
            name = name.lower().replace(" ", "-")
            alias = "4dn-dcic-lab:" + name
            filetype = 'html'
            styles = {
                'Data Set': ";width:20%;min-width:120px",
                'Replicate Sets': ";width:150px",
                'Publication': ";width:200px"
            }
            output = html_table_maker(table_dsg, keys, styles)

        # check if static section exists
        post = False
        try:
            ff_utils.get_metadata(alias, auth)
        except Exception:
            # any failure of the GET is treated as "section does not exist"
            print(
                "'{}' static section cannot be patched because it does not exist"
                .format(studygroup))
            print("Do you want to (p)ost or (s)kip this static section? [p/s]")
            response = None
            while response not in ['p', 's']:
                response = input()
            if response == 's':
                skipped.append(alias)
                continue
            else:
                post = True

        # post or patch static section
        if post:
            post_body = {
                "name": name,
                "aliases": [alias],
                "body": output,
                "section_type": "Page Section",
                "title": studygroup,
                "options": {
                    "collapsible": True,
                    "default_open": True,
                    "filetype": filetype
                }
            }
            if not dryrun:
                res = ff_utils.post_metadata(post_body, "StaticSection", key=auth)
            # recorded in the summary even on a dry run
            posted.append(alias)
        else:
            patch_body = {"body": output}
            if not dryrun:
                res = ff_utils.patch_metadata(patch_body, alias, key=auth)
            # recorded in the summary even on a dry run
            patched.append(alias)
        # `res` only exists when a request was actually sent
        if not dryrun:
            print("{}: {}".format(alias, res['status']))

    # summarize results
    print("Static sections summary: {} patched, {} posted, {} skipped".format(
        len(patched), len(posted), len(skipped)))
    if posted:
        print(
            "Remember to add the new static section(s) to the hic-data-overview page:"
        )
        for item in posted:
            print(item)
    if skipped:
        print("Skipped sections:")
        for item in skipped:
            print(item)
def find_items_for_header_processing(connection, check, header, add_search=None,
                                     remove_search=None, append=True):
    """
    Work out which items need a static header added or removed.

    Meant to be used for CHECKS.  The result is written to ``check`` (its
    full_output, status, summary and, when there is work to do, description,
    allow_action and action_message); nothing is returned.

    Args are:
    - connection (FS connection)
    - check (required; check object initialized by CheckResult)
    - header (required; @id of the static header)
    - add_search: search query for items that should have the header
    - remove_search: search query for items that should not have it
    - append: when True the header goes at the end of an item's existing
      static_headers, otherwise at the front

    full_output['to_add'] / ['to_remove'] map item @ids to the complete
    static_headers list the item should end up with.
    """
    def _current_header_ids(item):
        # handle case where frame != object: reduce header dicts to their @ids
        headers = item.get('static_headers', [])
        if headers and isinstance(headers[0], dict):
            return [obj['@id'] for obj in headers]
        return headers

    # sets the full_output of the check!
    check.full_output = {'static_section': header, 'to_add': {}, 'to_remove': {}}

    # this GET will fail if the static header does not exist
    ff_utils.get_metadata(header, key=connection.ff_keys)

    # add entries keyed by item @id with value of the static headers
    if add_search:
        for item in ff_utils.search_metadata(add_search, key=connection.ff_keys):
            headers = _current_header_ids(item)
            if header in headers:
                continue
            if append:
                headers = headers + [header]
            else:
                headers = [header] + headers
            check.full_output['to_add'][item['@id']] = headers

    if remove_search:
        for item in ff_utils.search_metadata(remove_search, key=connection.ff_keys):
            headers = _current_header_ids(item)
            if header not in headers:
                continue
            headers.remove(header)
            check.full_output['to_remove'][item['@id']] = headers

    if not (check.full_output['to_add'] or check.full_output['to_remove']):
        check.status = 'PASS'
        check.summary = 'Static header is all set'
    else:
        check.status = 'WARN'
        check.summary = 'Ready to add and/or remove static header'
        check.description = 'Ready to add and/or remove static header: %s' % header
        check.allow_action = True
        check.action_message = 'Will add static header to %s items and remove it from %s items' % (
            len(check.full_output['to_add']), len(check.full_output['to_remove']))
def initialize_user_content(spawner):
    """
    Used to initialize the user's s3-backed notebook storage.

    Steps performed on every call:
      1. Copy every object listed in the template bucket
         (env AWS_TEMPLATE_BUCKET) into the notebook bucket
         (env AWS_NOTEBOOK_BUCKET) under ``user-<escaped username>/``,
         always overwriting.
      2. Look up the Fourfront user ``/users/<username>``.
      3. Set every current access key of that user described as
         'jupyterhub_key' to status 'deleted'.
      4. Post a new access key for the user and export it as FF_ACCESS_KEY /
         FF_ACCESS_SECRET.
      5. Post a TrackingItem of type jupyterhub_session and export its uuid
         as FF_TRACKING_ID.

    Steps 3-5 only run when the user lookup succeeded.  Failures do not
    raise: each one is appended to a list that is finally stored, JSON
    encoded, in the INIT_ERR_OUTPUT environment variable.

    Args:
        spawner: object whose ``user.name`` attribute provides the username.
    """
    err_output = []  # keep track of errors for debugging

    # grab this info fresh every time
    ff_keys = recompute_ff_keys(err_output)

    username = spawner.user.name  # get the username
    list_res = s3_client.list_objects_v2(
        Bucket=os.environ['AWS_TEMPLATE_BUCKET'])

    # check each template individually
    for template_res in list_res.get('Contents', []):
        template_key = template_res['Key']
        user_subdir = 'user-' + escape_string(username)
        notebook_temp_key = '/'.join([user_subdir, template_key])
        source_info = {
            "Bucket": os.environ['AWS_TEMPLATE_BUCKET'],
            "Key": template_key
        }
        try:  # always replace templates
            s3_client.copy_object(Bucket=os.environ["AWS_NOTEBOOK_BUCKET"],
                                  Key=notebook_temp_key,
                                  CopySource=source_info)
        except Exception as copy_exc:
            # a failed copy is recorded; the remaining templates are still tried
            err_output.append({'copying_templates': str(copy_exc)})

    # get the access keys and set them as environment variables for the user
    try:
        ff_user = ff_utils.get_metadata('/users/' + username, key=ff_keys)
    except Exception as user_exc:
        err_output.append({'getting_user': str(user_exc)})
        clear_old_access_keys(
        )  # if we get here, old access key state must be cleared.
    else:
        # everything below needs ff_user['uuid'], hence the `else` branch
        key_descrip = 'jupyterhub_key'
        search_q = ''.join([
            '/search/?type=AccessKey&status=current&description=', key_descrip,
            '&user.uuid=', ff_user['uuid']
        ])
        try:
            user_keys = ff_utils.search_metadata(search_q, key=ff_keys)
        except Exception as search_exc:
            err_output.append({'searching_keys': str(search_exc)})
        else:
            # delete the previously created jupyterhub keys one by one;
            # a failure on one key does not stop the others
            for ukey in user_keys:
                try:
                    ff_utils.patch_metadata({'status': 'deleted'},
                                            ukey['uuid'],
                                            key=ff_keys)
                except Exception as patch_exc:
                    err_output.append({'deleting_keys': str(patch_exc)})
        # access key will be submitted by 4dn-dcic admin but belong to user
        key_body = {'user': ff_user['uuid'], 'description': key_descrip}
        try:
            key_res = ff_utils.post_metadata(key_body,
                                             'access-keys',
                                             key=ff_keys)
        except Exception as key_exc:
            err_output.append({'post_key': str(key_exc)})
            clear_old_access_keys(
            )  # if we get here, old access key state must be cleared.
        else:
            # NOTE(review): the key fields are read from the top level of the
            # response here, whereas the tracking item below is read from
            # '@graph' - presumably the two endpoints answer differently; confirm
            os.environ['FF_ACCESS_KEY'] = key_res['access_key_id']
            os.environ['FF_ACCESS_SECRET'] = key_res['secret_access_key']

        # initialize a tracking item for the session and store its uuid in env
        # set `submitted_by` manually to allow user to edit
        tracking_body = {
            'jupyterhub_session': {
                'date_initialized':
                datetime.datetime.utcnow().isoformat() + '+00:00',
                'user_uuid': ff_user['uuid']
            },
            'tracking_type': 'jupyterhub_session',
            'submitted_by': ff_user['uuid']
        }
        try:
            track_res = ff_utils.post_metadata(tracking_body,
                                               'tracking-items',
                                               key=ff_keys)
        except Exception as track_exc:
            err_output.append({'tracking_item': str(track_exc)})
        else:
            os.environ['FF_TRACKING_ID'] = track_res['@graph'][0]['uuid']

    # expose everything that went wrong (possibly an empty list) for debugging
    os.environ['INIT_ERR_OUTPUT'] = json.dumps(err_output)
def main():
    """
    Use this command to update the inserts from a given fourfront env.

    Command line options:
        --env           FF environment to read from (default 'data')
        --dest          inserts directory to write to; must be one of the
                        hardcoded directory names below
        --item-type     item type(s) to read from the local inserts
                        (repeatable; defaults to all types)
        --ignore-field  field name(s) passed to expand_es_metadata as
                        ignore_field (repeatable)
        --from-search   search query whose results are added to the uuids
                        read from the local inserts

    Items found on the server are written to the destination directory;
    local inserts that were not returned by the server are appended so that
    existing information is not lost.

    Raises:
        Exception: if --dest is not an allowed inserts directory, or if
            --dest is 'inserts' and a fetched item differs from the item with
            the same uuid in `master-inserts`.
    """
    logging.basicConfig()
    # Loading app will have configured from config file. Reconfigure here:
    logging.getLogger('encoded').setLevel(logging.DEBUG)

    parser = argparse.ArgumentParser(
        description="Update Inserts", epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--env', default='data',
                        help='FF environment to update from. Defaults to data')
    parser.add_argument('--dest', default='temp-local-inserts',
                        help="destination file in inserts dir to write to")
    parser.add_argument('--item-type', action='append', default=[],
                        help="item type, e.g. file_fastq. Defaults to all types")
    parser.add_argument('--ignore-field', action='append', default=[],
                        help='field name to ignore when running expand_es_metadata')
    parser.add_argument('--from-search',
                        help='query passed to search_metadata to find uuids')
    args = parser.parse_args()

    # this will work since bin/ commands are run from root FF directory
    inserts_location = 'src/encoded/tests/data'
    # hardcode these to prevent accidental creation of inserts files
    inserts_files = ['inserts', 'master-inserts', 'perf-testing',
                     'workbook-inserts', 'temp-local-inserts']
    if args.dest not in inserts_files:
        raise Exception('Specified inserts destination %s must be one of: %s'
                        % (args.dest, inserts_files))
    inserts_path = '/'.join([inserts_location, args.dest])

    local_inserts, item_uuids = read_local_inserts_dir(args.dest, inserts_path,
                                                       args.item_type)

    # add uuids from the input search result, if present
    if args.from_search:
        use_search = args.from_search
        # get frame=object search results to keep response small
        if 'frame=' not in use_search:
            use_search += '&frame=object'
        search_res = search_metadata(use_search, ff_env=args.env)
        search_uuids = [item['uuid'] for item in search_res]
        logger.info('update_inserts: Will update using %s items from search'
                    % len(search_uuids))
        item_uuids = list(set(item_uuids + search_uuids))

    # now find uuids and all linked from the given server
    svr_inserts, svr_uuids = expand_es_metadata(item_uuids, ff_env=args.env,
                                                store_frame='raw',
                                                add_pc_wfr=True,
                                                ignore_field=args.ignore_field)
    # membership is tested once per local insert below; use a set for O(1)
    svr_uuid_set = set(svr_uuids)

    # if we are updating `inserts`, must make sure that items don't conflict
    # with those in `master-inserts`
    skip_uuids = set()
    if args.dest == 'inserts':
        master_path = '/'.join([inserts_location, 'master-inserts'])
        master_inserts, master_uuids = read_local_inserts_dir('master-inserts',
                                                              master_path)
        master_uuid_set = set(master_uuids)
        item_conflict_report = {}
        for item_type in svr_inserts:
            itype_err = []
            itype_okay = []
            conflicting_items = [item for item in svr_inserts[item_type]
                                 if item['uuid'] in master_uuid_set]
            for conflict in conflicting_items:
                # compare inserts by dumping both to canonical (sorted) json
                svr_json = json.dumps(conflict, sort_keys=True)
                mstr_json = json.dumps(
                    master_inserts[item_type][conflict['uuid']],
                    sort_keys=True)
                if svr_json != mstr_json:
                    itype_err.append(conflict['uuid'])
                else:
                    # the json is the same. Remove from the `inserts` update
                    skip_uuids.add(conflict['uuid'])
                    itype_okay.append(conflict['uuid'])
            item_conflict_report[item_type] = {'error': itype_err,
                                               'okay': itype_okay}

        if any(report['error'] for report in item_conflict_report.values()):
            error_report = {it: item_conflict_report[it]['error']
                            for it in item_conflict_report}
            # the original message lacked the space before "in the
            # master-inserts" and rendered as "valuesin"
            logger.error('update_inserts: Cannot update the following items in "inserts" directory,'
                         ' since there are conflicting items with different values'
                         ' in the master-inserts. Update those first. Conflicts:\n%s'
                         % json.dumps(error_report, indent=4))
            raise Exception('Cannot load inserts as there are conflicting items in `master-inserts`')
        elif any(report['okay'] for report in item_conflict_report.values()):
            conflict_report = {it: item_conflict_report[it]['okay']
                               for it in item_conflict_report}
            logger.warning('update_inserts: The following items are already in "master-inserts".'
                           ' Will not add to "inserts". Items:\n%s'
                           % json.dumps(conflict_report, indent=4))

    # now we need to update the server inserts with contents from local inserts
    # so that existing information is not lost
    for item_type in svr_inserts:
        # remove items specified by skip uuids
        if skip_uuids:
            svr_inserts[item_type] = [insrt for insrt in svr_inserts[item_type]
                                      if insrt['uuid'] not in skip_uuids]
        for item_uuid in local_inserts.get(item_type, {}):
            if item_uuid not in svr_uuid_set and item_uuid not in skip_uuids:
                svr_inserts[item_type].append(local_inserts[item_type][item_uuid])

    dump_results_to_json(svr_inserts, inserts_path)
    logger.info('update_inserts: Successfully wrote to %s' % inserts_path)
    for item_type in svr_inserts:
        logger.info('update_inserts: Wrote %s items to %s'
                    % (len(svr_inserts[item_type]), item_type + '.json'))
def _delete_wfr_and_outputs(wfr_entry, my_key, deleted_wfrs):
    """Patch one workflow run report entry, its output files and its qcs to deleted.

    The uuid of the run is recorded in `deleted_wfrs` before patching.
    """
    patch_data = {
        'description': "This workflow run is deleted",
        'status': "deleted"
    }
    deleted_wfrs.append(wfr_entry['wfr_uuid'])
    ff_utils.patch_metadata(patch_data, obj_id=wfr_entry['wfr_uuid'], key=my_key)
    # delete output files of the deleted workflow run
    if wfr_entry['outputs']:
        for out_file in wfr_entry['outputs']:
            ff_utils.patch_metadata({'status': "deleted"}, obj_id=out_file, key=my_key)
    if wfr_entry.get('qcs'):
        for out_qc in wfr_entry['qcs']:
            ff_utils.patch_metadata({'status': "deleted"}, obj_id=out_qc, key=my_key)


def delete_wfrs(file_resp, my_key, delete=False, stash=None):
    """
    Find (and optionally delete) workflow runs on a file that should go away.

    For a file with status 'deleted', every workflow run that uses it as input
    is a candidate. For any other file, the candidates per workflow listed in
    `workflow_details` are all runs but the last one in the report, plus the
    last one when it is not complete (unless still running within the accepted
    run time) or when its version is not an accepted one.

    Args:
        file_resp: file item in embedded frame
        my_key: access key passed through to ff_utils
        delete: when False (default) candidates are only printed; when True
            they, their output files and their qcs are patched to status
            deleted (and, for a deleted file, its quality_metric is removed
            and patched to deleted)
        stash: optional collection of all related wfrs for file_resp; when
            given, it is used instead of fetching the runs from the database

    Returns:
        list of uuids of the workflow runs patched to deleted. None is
        returned on the early exits: user submitted processed files, control
        files, files without workflow runs, and (for deleted files) as soon as
        a run with status 'released' or 'released to project' is met.

    Raises:
        AssertionError: if `stash` is given but does not contain every
            workflow run listed on the file.
    """
    deleted_wfrs = []
    wfr_report = []
    file_type = file_resp['@id'].split('/')[1]

    # special clause until we sort input_wfr_switch issue
    # do not delete output wfrs of control files
    output_wfrs = file_resp.get('workflow_run_outputs')
    if not output_wfrs:
        if file_type == 'files-processed':
            # user submitted processed files
            return
        # raw files: carry on
    else:
        # only the part before ' run ' is needed; partition does not fail
        # on titles that contain the separator more or less than once
        wfr_type = output_wfrs[0]['display_title'].partition(' run ')[0]
        if wfr_type == 'encode-chipseq-aln-ctl 1.1.1':
            print('skipping control file for wfr check', file_resp['accession'])
            return

    # the field may be absent on the file; treat that as "no workflow runs"
    wfr_uuids = [i['uuid'] for i in file_resp.get('workflow_run_inputs') or []]
    wfrs = []
    if wfr_uuids:
        if stash:
            # fetch them from stash
            wanted = set(wfr_uuids)
            wfrs = [i for i in stash if i['uuid'] in wanted]
            if len(wfrs) != len(wfr_uuids):
                # explicit raise so the check survives `python -O`
                raise AssertionError(
                    'stash does not match the workflow runs of '
                    + str(file_resp.get('accession')))
        else:
            # if no stash, get from database
            wfrs = [
                i['embedded'] for i in ff_utils.get_es_metadata(
                    wfr_uuids, sources=['embedded.*'], key=my_key)
            ]
    elif file_type not in ['files-fastq', 'files-processed']:
        # look for md5s on files without wfr_run_output (file_microscopy ...)
        wfrs_url = (
            '/search/?type=WorkflowRun&type=WorkflowRun&workflow.title=md5+0.2.6&workflow.title=md5+0.0.4'
            '&input_files.value.accession=') + file_resp['accession']
        wfrs = ff_utils.search_metadata(wfrs_url, key=my_key)

    # Skip sbg and file provenance
    wfrs = [i for i in wfrs if not i['@id'].startswith('/workflow-runs-sbg/')]
    wfrs = [
        i for i in wfrs
        if not i['display_title'].startswith('File Provenance Tracking')
    ]

    # CLEAN UP IF FILE IS DELETED
    if file_resp['status'] == 'deleted':
        if file_resp.get('quality_metric'):
            if delete:
                qc_uuid = file_resp['quality_metric']['uuid']
                ff_utils.delete_field(file_resp, 'quality_metric', key=my_key)
                # delete quality metrics object
                patch_data = {'status': "deleted"}
                ff_utils.patch_metadata(patch_data, obj_id=qc_uuid, key=my_key)
        # delete all workflows for deleted files
        if not wfrs:
            return
        wfr_report = get_wfr_report(wfrs)
        for wfr_to_del in wfr_report:
            if wfr_to_del['status'] == 'deleted':
                continue
            if wfr_to_del['wfr_name'] not in workflow_names:
                print('Unlisted Workflow', wfr_to_del['wfr_name'], 'deleted file workflow',
                      wfr_to_del['wfr_uuid'], file_resp['accession'])
            ####################################################
            # TEMPORARY PIECE##################################
            # NOTE: these exits return None, also when earlier runs in the
            # report were already deleted in this call
            if wfr_to_del['status'] == 'released to project':
                print('saved from deletion', wfr_to_del['wfr_name'], 'deleted file workflow',
                      wfr_to_del['wfr_uuid'], file_resp['accession'])
                return
            if wfr_to_del['status'] == 'released':
                print('delete released!!!!!', wfr_to_del['wfr_name'], 'deleted file workflow',
                      wfr_to_del['wfr_uuid'], file_resp['accession'])
                return
            #####################################################
            print(wfr_to_del['wfr_name'], 'deleted file workflow',
                  wfr_to_del['wfr_uuid'], file_resp['accession'])
            if delete:
                _delete_wfr_and_outputs(wfr_to_del, my_key, deleted_wfrs)
    else:
        # get a report on all workflow_runs
        if not wfrs:
            return
        wfr_report = get_wfr_report(wfrs)
        # check if any unlisted wfr in report
        my_wfr_names = [i['wfr_name'] for i in wfr_report]
        unlisted = [x for x in my_wfr_names if x not in workflow_names]
        # report the unlisted ones
        if unlisted:
            print('Unlisted Workflow', unlisted, 'skipped in', file_resp['accession'])
        for wf_name, accepted_rev, accepted_run_time in workflow_details:
            # for each type of workflow make a list of old ones, and patch status and description
            sub_wfrs = [i for i in wfr_report if i['wfr_name'] == wf_name]
            if not sub_wfrs:
                continue
            # the last entry of the report is treated as the active run
            active_wfr = sub_wfrs[-1]
            old_wfrs = sub_wfrs[:-1]
            # check the status of the most recent workflow
            if active_wfr['wfr_status'] != 'complete':
                if (active_wfr['wfr_status'] in ['running', 'started']
                        and active_wfr['run_time'] < accepted_run_time):
                    print(wf_name, 'still running for', file_resp['accession'])
                else:
                    old_wfrs.append(active_wfr)
            elif active_wfr['wfr_version'] not in accepted_rev:
                old_wfrs.append(active_wfr)
            for wfr_to_del in old_wfrs:
                if wfr_to_del['status'] == 'deleted':
                    continue
                if wfr_to_del['status'] in ['archived', 'replaced']:
                    print(wfr_to_del['wfr_name'], wfr_to_del['status'], ' wfr found, skipping ',
                          wfr_to_del['wfr_uuid'], file_resp['accession'])
                    continue
                ####################################################
                # TEMPORARY PIECE
                if wfr_to_del['status'] == 'released to project':
                    print('saved from deletion', wfr_to_del['wfr_name'], 'old style or dub',
                          wfr_to_del['wfr_uuid'], file_resp['accession'])
                    continue
                if wfr_to_del['status'] == 'released':
                    print('delete released????', wfr_to_del['wfr_name'], 'old style or dub',
                          wfr_to_del['wfr_uuid'], file_resp['accession'])
                    continue
                ####################################################
                print(wfr_to_del['wfr_name'], 'old style or dub',
                      wfr_to_del['wfr_uuid'], file_resp['accession'])
                if delete:
                    _delete_wfr_and_outputs(wfr_to_del, my_key, deleted_wfrs)
    return deleted_wfrs
def main():
    """
    Use this command to update the inserts from a given fourfront env.

    Command line options:
        --env           FF environment to read from (default 'data')
        --dest          inserts directory to write to; must be one of the
                        hardcoded directory names below
        --item-type     item type(s) to read from the local inserts
                        (repeatable; defaults to all types)
        --ignore-field  field name(s) passed to expand_es_metadata as
                        ignore_field (repeatable; values given on the command
                        line are appended to the defaults, see below)
        --from-search   search query whose results are added to the uuids
                        read from the local inserts

    Items found on the server are written to the destination directory,
    sorted by the position they already have in the local inserts; local
    inserts that were not returned by the server are appended afterwards so
    that existing information is not lost.

    Raises:
        Exception: if --dest is not an allowed inserts directory, or if
            --dest is 'inserts' and a fetched item differs from the item with
            the same uuid in `master-inserts`.
    """
    logging.basicConfig()
    # Loading app will have configured from config file. Reconfigure here:
    logging.getLogger('encoded').setLevel(logging.DEBUG)

    parser = argparse.ArgumentParser(  # noqa - PyCharm wrongly thinks the formatter_class is specified wrong here.
        description="Update Inserts", epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--env', default='data',
                        help='FF environment to update from. Defaults to data')
    parser.add_argument('--dest', default='temp-local-inserts',
                        help="destination file in inserts dir to write to")
    parser.add_argument(
        '--item-type', action='append', default=[],
        help="item type, e.g. file_fastq. Defaults to all types")
    # with action='append', argparse adds command line values to this default
    # list rather than replacing it
    parser.add_argument(
        '--ignore-field', action='append',
        default=[
            "submitted_by", "date_created", "last_modified", "schema_version"
        ],
        help='field name to ignore when running expand_es_metadata')
    parser.add_argument('--from-search',
                        help='query passed to search_metadata to find uuids')
    args = parser.parse_args()

    # this will work since bin/ commands are run from root FF directory
    inserts_location = 'src/encoded/tests/data'
    # hardcode these to prevent accidental creation of inserts files
    inserts_files = [
        'inserts', 'master-inserts', 'perf-testing', 'workbook-inserts',
        'temp-local-inserts'
    ]
    if args.dest not in inserts_files:
        raise Exception('Specified inserts destination %s must be one of: %s'
                        % (args.dest, inserts_files))
    inserts_path = '/'.join([inserts_location, args.dest])

    local_inserts, item_uuids = read_local_inserts_dir(args.dest, inserts_path,
                                                       args.item_type)

    # Used to preserve order of existing inserts in folder(s), if any.
    local_inserts_ordering_map = {}
    for item_type, local_inserts_for_type in local_inserts.items():
        for insrt_index, insrt_uuid in enumerate(local_inserts_for_type):
            # Duplicate insrt_index between different item types are OK and present.
            # local_inserts_ordering_map is shallow.
            local_inserts_ordering_map[insrt_uuid] = insrt_index

    # add uuids from the input search result, if present
    if args.from_search:
        use_search = args.from_search
        # get frame=object search results to keep response small
        if 'frame=' not in use_search:
            use_search += '&frame=object'
        search_res = search_metadata(use_search, ff_env=args.env)
        search_uuids = [item['uuid'] for item in search_res]
        logger.info('update_inserts: Will update using %s items from search'
                    % len(search_uuids))
        item_uuids = list(set(item_uuids + search_uuids))

    # now find uuids and all linked from the given server
    svr_inserts, svr_uuids = expand_es_metadata(item_uuids, ff_env=args.env,
                                                store_frame='raw',
                                                add_pc_wfr=True,
                                                ignore_field=args.ignore_field)
    # membership is tested once per local insert below; use a set for O(1)
    svr_uuid_set = set(svr_uuids)

    # if we are updating `inserts`, must make sure that items don't conflict
    # with those in `master-inserts`
    skip_uuids = set()
    if args.dest == 'inserts':
        master_path = '/'.join([inserts_location, 'master-inserts'])
        master_inserts, master_uuids = read_local_inserts_dir(
            'master-inserts', master_path)
        master_uuid_set = set(master_uuids)
        item_conflict_report = {}
        for item_type in svr_inserts:
            itype_err = []
            itype_okay = []
            conflicting_items = [
                item for item in svr_inserts[item_type]
                if item['uuid'] in master_uuid_set
            ]
            for conflict in conflicting_items:
                # compare inserts by dumping both to canonical (sorted) json
                svr_json = json.dumps(conflict, sort_keys=True)
                mstr_json = json.dumps(
                    master_inserts[item_type][conflict['uuid']],
                    sort_keys=True)
                if svr_json != mstr_json:
                    itype_err.append(conflict['uuid'])
                else:
                    # the json is the same. Remove from the `inserts` update
                    skip_uuids.add(conflict['uuid'])
                    itype_okay.append(conflict['uuid'])
            item_conflict_report[item_type] = {
                'error': itype_err,
                'okay': itype_okay
            }

        if any(report['error'] for report in item_conflict_report.values()):
            error_report = {
                it: item_conflict_report[it]['error']
                for it in item_conflict_report
            }
            # the original message lacked the space before "in the
            # master-inserts" and rendered as "valuesin"
            logger.error(
                'update_inserts: Cannot update the following items in "inserts" directory,'
                ' since there are conflicting items with different values'
                ' in the master-inserts. Update those first. Conflicts:\n%s'
                % json.dumps(error_report, indent=4))
            raise Exception(
                'Cannot load inserts as there are conflicting items in `master-inserts`'
            )
        elif any(report['okay'] for report in item_conflict_report.values()):
            conflict_report = {
                it: item_conflict_report[it]['okay']
                for it in item_conflict_report
            }
            logger.warning(
                'update_inserts: The following items are already in "master-inserts".'
                ' Will not add to "inserts". Items:\n%s'
                % json.dumps(conflict_report, indent=4))

    # now we need to update the server inserts with contents from local inserts
    # so that existing information is not lost
    for item_type in svr_inserts:
        if skip_uuids:
            # remove items specified by skip uuids
            svr_inserts[item_type] = [
                insrt for insrt in svr_inserts[item_type]
                if insrt['uuid'] not in skip_uuids
            ]
        # items unknown locally sort after every known one, however many
        # local inserts exist (the previous sentinel was a fixed 99999)
        svr_inserts[item_type].sort(
            key=lambda insrt: local_inserts_ordering_map.get(
                insrt["uuid"], float('inf')))
        for item_uuid in local_inserts.get(item_type, {}):
            if item_uuid not in svr_uuid_set and item_uuid not in skip_uuids:
                svr_inserts[item_type].append(
                    local_inserts[item_type][item_uuid])

    dump_results_to_json(svr_inserts, inserts_path)
    logger.info('update_inserts: Successfully wrote to %s' % inserts_path)
    for item_type in svr_inserts:
        logger.info('update_inserts: Wrote %s items to %s'
                    % (len(svr_inserts[item_type]), item_type + '.json'))
def consistent_replicate_info(connection, **kwargs):
    '''
    Check replicate experiment sets for metadata that differs between their
    replicate experiments.

    Experiment fields listed in `fields2check` and a handful of biosample
    fields are stringified and compared across the experiments of each set.
    Sets in a status listed in REV are only reported in brief_output; the
    messages of all other sets are handed to compare_badges_and_messages to
    work out which 'inconsistent-replicate-info' badges to add, remove or
    edit. The action patches badges with a message detailing which fields are
    inconsistent and what the inconsistent values are.
    '''
    check = CheckResult(connection, 'consistent_replicate_info')

    repset_url = 'search/?type=ExperimentSetReplicate&field=experiments_in_set.%40id&field=uuid&field=status&field=lab.display_title'
    exp_url = 'search/?type=Experiment&frame=object'
    bio_url = 'search/?type=Experiment&field=biosample'

    # only sets that actually contain experiments are compared
    repsets = [
        repset
        for repset in ff_utils.search_metadata(repset_url, key=connection.ff_keys)
        if repset.get('experiments_in_set')
    ]
    exps = ff_utils.search_metadata(exp_url, key=connection.ff_keys)
    biosamples = ff_utils.search_metadata(bio_url, key=connection.ff_keys)

    # lookups by experiment @id
    exp_keys = {exp['@id']: exp for exp in exps}
    bio_keys = {bs['@id']: bs['biosample'] for bs in biosamples}

    fields2check = [
        'lab',
        'award',
        'experiment_type',
        'crosslinking_method',
        'crosslinking_time',
        'crosslinking_temperature',
        'digestion_enzyme',
        'enzyme_lot_number',
        'digestion_time',
        'digestion_temperature',
        'tagging_method',
        'tagging_rounds',
        'ligation_time',
        'ligation_temperature',
        'ligation_volume',
        'biotin_removed',
        'protocol',
        'protocol_variation',
        'follows_sop',
        'average_fragment_size',
        'fragment_size_range',
        'fragmentation_method',
        'fragment_size_selection_method',
        'rna_tag',
        'target_regions',
        'dna_label',
        'labeling_time',
        'antibody',
        'antibody_lot_id',
        'microscopy_technique',
        'imaging_paths',
    ]
    check.brief_output = {REV_KEY: {}, RELEASED_KEY: {
        'Add badge': {}, 'Remove badge': {}, 'Keep badge and edit messages': {}
    }}
    compare = {}
    results = {}

    for repset in repsets:
        mismatches = {}
        exp_ids = [exp['@id'] for exp in repset['experiments_in_set']]

        # fields on the experiments themselves
        for field in fields2check:
            vals = [stringify(exp_keys[exp_id].get(field)) for exp_id in exp_ids]
            if field == 'average_fragment_size' and 'None' not in vals:
                # fragment sizes are allowed to vary: a spread below a
                # quarter of the mean is not reported
                sizes = [int(val) for val in vals]
                spread = max(sizes) - min(sizes)
                mean_size = sum(sizes) / len(sizes)
                if spread / mean_size < 0.25:
                    continue
            if len(set(vals)) > 1:
                mismatches[field] = vals

        # summary fields on the biosamples
        for bfield in ['treatments_summary', 'modifications_summary']:
            bvals = [stringify(bio_keys[exp_id].get(bfield)) for exp_id in exp_ids]
            if len(set(bvals)) > 1:
                mismatches[bfield] = bvals

        biosource_vals = [
            stringify([source['@id'] for source in bio_keys[exp_id].get('biosource')])
            for exp_id in exp_ids
        ]
        if len(set(biosource_vals)) > 1:
            mismatches['biosource'] = biosource_vals

        # NOTE(review): the guard only needs one experiment with
        # cell_culture_details, yet every experiment is dereferenced below -
        # confirm all biosamples of a set carry the field.
        if any(bio_keys[exp_id].get('cell_culture_details') for exp_id in exp_ids):
            # 'follows_sop' is also an experiment field above; a mismatch
            # found here replaces that entry
            for ccfield in ['synchronization_stage', 'differentiation_stage', 'follows_sop']:
                ccvals = [
                    stringify([
                        entry['@id']
                        for entry in bio_keys[exp_id].get('cell_culture_details').get(ccfield)
                    ])
                    for exp_id in exp_ids
                ]
                if len(set(ccvals)) > 1:
                    mismatches[ccfield] = ccvals

        if any(bio_keys[exp_id].get('biosample_protocols') for exp_id in exp_ids):
            bp_vals = [
                stringify([
                    protocol['@id']
                    for protocol in bio_keys[exp_id].get('biosample_protocols', [])
                ])
                for exp_id in exp_ids
            ]
            if len(set(bp_vals)) > 1:
                mismatches['biosample_protocols'] = bp_vals

        if not mismatches:
            continue

        info = sorted(
            '{}: {}'.format(field, stringify(values))
            for field, values in mismatches.items()
        )
        msgs = ['Inconsistent replicate information in ' + entry for entry in info]
        text = '{} - inconsistency in {}'.format(
            repset['@id'][-13:-1], ', '.join(list(mismatches.keys())))
        lab = repset['lab']['display_title']
        in_review = repset['status'] in REV
        audit_key = REV_KEY if in_review else RELEASED_KEY
        results[repset['@id']] = {'status': audit_key, 'lab': lab, 'info': text}
        if audit_key == REV_KEY:
            check.brief_output[audit_key].setdefault(lab, []).append(text)
        if not in_review:
            # only sets outside REV are considered for badges
            compare[repset['@id']] = msgs

    to_add, to_remove, to_edit, ok = compare_badges_and_messages(
        compare, 'ExperimentSetReplicate', 'inconsistent-replicate-info', connection.ff_keys
    )
    key_dict = {'Add badge': to_add, 'Remove badge': to_remove, 'Keep badge and edit messages': to_edit}

    # file each reported set under the first badge action that mentions it
    for repset_id, details in results.items():
        for label, badge_items in key_dict.items():
            if repset_id in badge_items.keys():
                released_out = check.brief_output[RELEASED_KEY][label]
                released_out.setdefault(details['lab'], []).append(details['info'])
                break
    # removals are listed by id only, replacing whatever was collected above
    check.brief_output[RELEASED_KEY]['Remove badge'] = list(to_remove.keys())

    if to_add or to_remove or to_edit:
        check.status = 'WARN'
        check.summary = 'Replicate Info badges need patching'
        patch_count = len(to_add.keys()) + len(to_remove.keys()) + len(to_edit.keys())
        check.description = ('{} ExperimentSetReplicates found that need a replicate-info badge patched'
                             ''.format(patch_count))
    else:
        check.status = 'PASS'
        check.summary = 'Replicate Info badges are up-to-date'
        check.description = 'No ExperimentSetReplicates found that need a replicate-info badge patched'
    check.full_output = {'Add badge': to_add,
                         'Remove badge': to_remove,
                         'Keep badge and edit messages': to_edit,
                         'Keep badge (no change)': ok}
    check.action = 'patch_badges_for_inconsistent_replicate_info'
    if to_add or to_remove or to_edit:
        check.allow_action = True
    return check
def _workflow_step_io_issues(step, io_field, link_field, io_label, link_label):
    """Return (category, message) pairs found on one side of a workflow step.

    `io_field` is 'inputs' or 'outputs', `link_field` the matching 'source'
    or 'target' list on each entry; the labels are the words used for them
    in the categories and messages. Missing fields are treated as empty.
    """
    step_name = step.get('name')
    entries = step.get(io_field) or []
    found = []

    # data/reference files must declare a file format
    for entry in entries:
        meta = entry.get('meta') or {}
        if (meta.get('type') in ['data file', 'reference file']
                and not meta.get('file_format')):
            found.append((
                'Missing meta.file_format property in Workflow Step {}'.format(io_label),
                'Missing meta.file_format property in Workflow Step `{}` {} `{}`'
                ''.format(step_name, io_label, entry.get('name'))
            ))

    # no duplicates in input/output names
    names = [entry.get('name') for entry in entries]
    if len(set(names)) != len(names):
        category = 'Duplicate {} Names in Workflow Step'.format(io_label)
        found.append((category, '{} {}'.format(category, step_name)))

    # no duplicates in source/target names (per originating step)
    links = [(link.get('name'), link.get('step', 'GLOBAL'))
             for entry in entries
             for link in entry.get(link_field) or []]
    if len(links) != len(set(links)):
        category = 'Duplicate {} {} Names in Workflow Step'.format(io_label, link_label)
        found.append((category, '{} {}'.format(category, step_name)))

    return found


def workflow_properties(connection, **kwargs):
    """
    Check the `steps` of all non-provenance Workflows for naming problems.

    For every step, inputs and outputs are checked for duplicate names,
    duplicate source/target names, and data or reference files without a
    meta.file_format. Workflows or steps lacking `steps`, `inputs`,
    `outputs`, `source`, `target` or `meta` are treated as having none
    instead of aborting the whole check.

    Returns:
        the CheckResult, with status WARN when any workflow has issues.
        brief_output maps each issue category to the workflow @ids showing
        it; full_output maps each workflow @id to its list of messages.
    """
    check = CheckResult(connection, 'workflow_properties')
    workflows = ff_utils.search_metadata(
        'search/?type=Workflow&category!=provenance&frame=object',
        key=connection.ff_keys)
    bad = {
        'Duplicate Input Names in Workflow Step': [],
        'Duplicate Output Names in Workflow Step': [],
        'Duplicate Input Source Names in Workflow Step': [],
        'Duplicate Output Target Names in Workflow Step': [],
        'Missing meta.file_format property in Workflow Step Input': [],
        'Missing meta.file_format property in Workflow Step Output': []
    }
    by_wf = {}
    for wf in workflows:
        found = []
        for step in wf.get('steps') or []:
            found.extend(_workflow_step_io_issues(
                step, 'inputs', 'source', 'Input', 'Source'))
            found.extend(_workflow_step_io_issues(
                step, 'outputs', 'target', 'Output', 'Target'))
        if not found:
            continue
        # each workflow is listed once per category it shows
        seen_categories = {category for category, _ in found}
        for category in bad:
            if category in seen_categories:
                bad[category].append(wf['@id'])
        by_wf[wf['@id']] = [message for _, message in found]

    if by_wf:
        check.status = 'WARN'
        check.summary = 'Workflows found with issues in `steps`'
        check.description = (
            '{} workflows found with duplicate item names or missing fields'
            ' in `steps`'.format(len(by_wf)))
    else:
        check.status = 'PASS'
        check.summary = 'No workflows with issues in `steps` field'
        check.description = (
            'No workflows found with duplicate item names or missing fields'
            ' in steps property')
    check.brief_output = bad
    check.full_output = by_wf
    return check
def chipseq_status(connection, **kwargs):
    """
    Report (and prepare the action input for) the processing state of ChIP-seq sets.

    Keyword arguments:
    lab_title -- limit search with a lab i.e. Bing+Ren, UCSD
    start_date -- limit search to files generated since a date formatted YYYY-MM-DD
    run_time -- assume runs beyond run_time are dead

    For each set returned by the query, the workflow steps are evaluated with
    wfr_utils.stepper: merge-fastq (step0) when an experiment has more than 2
    file groups, alignment per experiment (step1 / step1c for controls) and
    post-alignment on the set (step2). The outcome is collected in
    check.full_output under 'skipped', 'running_runs', 'needs_runs',
    'completed_runs' and 'problematic_runs'.

    Returns the CheckResult; status is WARN when anything is skipped, missing,
    completed (needs patching) or problematic.
    """
    start = datetime.utcnow()
    check = CheckResult(connection, 'chipseq_status')
    my_auth = connection.ff_keys
    check.action = "chipseq_start"
    check.description = "run missing steps and add processing results to processed files, match set status"
    check.brief_output = []
    check.summary = ""
    check.full_output = {'skipped': [], 'running_runs': [], 'needs_runs': [],
                         'completed_runs': [], 'problematic_runs': []}
    check.status = 'PASS'
    exp_type = 'ChIP-seq'
    # completion tag
    tag = wfr_utils.accepted_versions[exp_type][-1]
    # check indexing queue
    check, skip = wfr_utils.check_indexing(check, connection)
    if skip:
        return check
    # Build the query, add date and lab if available
    query = wfr_utils.build_exp_type_query(exp_type, kwargs)
    res = ff_utils.search_metadata(query, key=my_auth)
    print(len(res))
    if not res:
        check.summary = 'All Good!'
        return check
    # run step 0 on all experiments with more than 2 sets of files
    # for control sets, run step1c on each experiment and finish
    # for non-control sets, run step1 on each experiment, check if control is ready, run step2 on set
    step0_name = 'merge-fastq'
    step1_name = 'encode-chipseq-aln-chip'
    step1c_name = 'encode-chipseq-aln-ctl'
    step2_name = 'encode-chipseq-postaln'

    for a_set in res:
        set_acc = a_set['accession']
        all_items, all_uuids = ff_utils.expand_es_metadata([a_set['uuid']], my_auth,
                                                           store_frame='embedded',
                                                           add_pc_wfr=True,
                                                           ignore_field=[  # 'experiment_relation',
                                                                         'biosample_relation',
                                                                         'references',
                                                                         'reference_pubs'])
        now = datetime.utcnow()
        print(a_set['accession'], (now-start).seconds, len(all_uuids))
        # stop picking up new sets once the time budget is used up
        if (now-start).seconds > lambda_limit:
            break
        # are all files uploaded ?
        all_uploaded = True
        for a_file in all_items['file_fastq']:
            if a_file['status'] in ['uploading', 'upload failed']:
                all_uploaded = False
        if not all_uploaded:
            final_status = a_set['accession'] + ' skipped, waiting for file upload'
            print(final_status)
            check.brief_output.append(final_status)
            check.full_output['skipped'].append({a_set['accession']: 'files status uploading'})
            continue
        all_wfrs = all_items.get('workflow_run_awsem', []) + all_items.get('workflow_run_sbg', [])
        all_files = [i for typ in all_items for i in all_items[typ] if typ.startswith('file_')]
        all_qcs = [i for typ in all_items for i in all_items[typ] if typ.startswith('quality_metric')]
        library = {'wfrs': all_wfrs, 'files': all_files, 'qcs': all_qcs}
        keep = {'missing_run': [], 'running': [], 'problematic_run': []}
        # if all completed, patch this info
        complete = {'patch_opf': [],
                    'add_tag': []}
        # some feature to extract from each set
        control = ""  # True or False (True if set is control)
        control_set = ""  # None if there are no control experiments or if the set is control
        target_type = ""  # Histone or TF (or None for control)
        paired = ""  # single or paired , checked for each experiment
        organism = ""
        replicate_exps = a_set['replicate_exps']
        replicate_exps = sorted(replicate_exps, key=lambda x: [x['bio_rep_no'], x['tec_rep_no']])
        # get organism, target and control from the first replicate
        f_exp = replicate_exps[0]['replicate_exp']['uuid']
        # have to do another get for control experiments if there is one
        f_exp_resp = [i for i in all_items['experiment_seq'] if i['uuid'] == f_exp][0]
        control, control_set, target_type, organism = wfr_utils.get_chip_info(f_exp_resp, all_items)
        print('ORG:', organism, "CONT:", control, "TARGET:", target_type, "CONT_SET:", control_set)
        set_summary = " - ".join([set_acc, str(organism), str(target_type), str(control)])
        # sanity checks
        # if control and also has an AB with target
        if control and target_type:
            set_summary += "| error - has target and is control"
            check.brief_output.append(set_summary)
            check.full_output['skipped'].append({set_acc: set_summary})
            continue
        # can only process mouse and human at the moment
        if organism not in ['mouse', 'human']:
            set_summary += "| organism not ready for chip"
            check.brief_output.append(set_summary)
            check.full_output['skipped'].append({set_acc: set_summary})
            continue
        # if not control, we need a target
        if not control and not target_type:
            set_summary += "| missing target type"
            check.brief_output.append(set_summary)
            check.full_output['skipped'].append({set_acc: set_summary})
            continue
        # collect results from step1 runs for step2
        ta = []
        taxcor = []
        ta_cnt = []
        # track if all experiments completed step0 and step1
        ready_for_step2 = True
        # track if all control experiments are completed processing.
        # This is a set-level flag: it must be initialised once per set, not once
        # per experiment, otherwise a missing control on any experiment but the
        # last one is forgotten and step2 starts with incomplete controls.
        control_ready = True
        for an_exp in replicate_exps:
            # track if all experiments completed step0
            ready_for_step1 = True
            exp_id = an_exp['replicate_exp']['accession']
            exp_resp = [i for i in all_items['experiment_seq'] if i['accession'] == exp_id][0]
            exp_files, paired = wfr_utils.get_chip_files(exp_resp, all_files)
            # if there are more then 2 files, we need to merge:
            print(exp_id, len(exp_files), paired)
            # if too many input, merge them
            if len(exp_files) > 2:
                # exp_files format [[pair1,pair2], [pair1, pair2]]  @id
                input_list = []
                if paired == 'paired':
                    # first add paired end 1s
                    input_list.append([i[0] for i in exp_files])
                    input_list.append([i[1] for i in exp_files])
                elif paired == 'single':
                    input_list.append([i[0] for i in exp_files])
                # collect files for step1 and step1c
                merged_files = []
                step0_status = 'complete'
                merge_enum = 0
                # if paired, need to run merge twice for each end
                for merge_case in input_list:
                    merge_enum += 1
                    # RUN STEP 0
                    s0_input_files = {'input_fastqs': merge_case}
                    s0_tag = exp_id + '_p' + str(merge_enum)
                    keep, step0_status, step0_output = wfr_utils.stepper(library, keep,
                                                                         'step0', s0_tag, merge_case,
                                                                         s0_input_files, step0_name, 'merged_fastq',
                                                                         organism=organism)
                    if step0_status == 'complete':
                        merged_files.append(step0_output)
                    else:
                        ready_for_step1 = False

                if ready_for_step1:
                    # rewrite exp_files with merged ones
                    exp_files = [[]]
                    for a_merged in merged_files:
                        exp_files[0].append(a_merged)
            # if step0 was not complete, skip checks for step2
            if not ready_for_step1:
                ready_for_step2 = False
                continue

            # step1 references:
            input_files = {}
            if organism == 'human':
                org = 'hs'
                input_files['chip.bwa_idx_tar'] = '/files-reference/4DNFIZQB369V/'
                input_files['chip.blacklist'] = '/files-reference/4DNFIZ1TGJZR/'
                input_files['chip.chrsz'] = '/files-reference/4DNFIZJB62D1/'
                input_files['additional_file_parameters'] = {"chip.bwa_idx_tar": {"rename": "GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.tar"}}
            if organism == 'mouse':
                org = 'mm'
                input_files['chip.bwa_idx_tar'] = '/files-reference/4DNFIZ2PWCC2/'
                input_files['chip.blacklist'] = '/files-reference/4DNFIZ3FBPK8/'
                input_files['chip.chrsz'] = '/files-reference/4DNFIBP173GC/'
                input_files['additional_file_parameters'] = {"chip.bwa_idx_tar": {"rename": "mm10_no_alt_analysis_set_ENCODE.fasta.tar"}}
            # step1 Parameters
            parameters = {}
            parameters["chip.gensz"] = org
            if paired == 'single':
                frag_temp = [300]
                fraglist = frag_temp * len(exp_files)
                parameters['chip.fraglen'] = fraglist
                parameters['chip.paired_end'] = False
            elif paired == 'paired':
                parameters['chip.paired_end'] = True

            # run step1 for control
            if control:
                # control run on tf mode
                input_files['chip.ctl_fastqs'] = [exp_files]
                control_parameters = {
                    "chip.pipeline_type": 'tf',
                    "chip.choose_ctl.always_use_pooled_ctl": True,
                    "chip.bam2ta_ctl.regex_grep_v_ta": "chr[MUE]|random|alt",
                    "chip.bwa_ctl.cpu": 8,
                    "chip.merge_fastq_ctl.cpu": 8,
                    "chip.filter_ctl.cpu": 8,
                    "chip.bam2ta_ctl.cpu": 8,
                    "chip.align_only": True
                }
                parameters.update(control_parameters)
                s1c_input_files = input_files
                s1c_tag = exp_id
                keep, step1c_status, step1c_output = wfr_utils.stepper(library, keep,
                                                                       'step1c', s1c_tag, exp_files,
                                                                       s1c_input_files, step1c_name, 'chip.first_ta_ctl',
                                                                       additional_input={'parameters': parameters},
                                                                       organism=organism)
                if step1c_status == 'complete':
                    # accumulate files to patch on experiment
                    patch_data = [step1c_output, ]
                    complete['patch_opf'].append([exp_id, patch_data])
                else:
                    # don't patch anything if at least one exp is still missing
                    ready_for_step2 = False
                print('step1c')
                print(step1c_status, step1c_output)
            # run step1
            else:
                input_files['chip.fastqs'] = [exp_files]
                exp_parameters = {
                    "chip.pipeline_type": target_type,
                    "chip.choose_ctl.always_use_pooled_ctl": True,
                    "chip.bam2ta.regex_grep_v_ta": "chr[MUE]|random|alt",
                    "chip.bwa.cpu": 8,
                    "chip.merge_fastq.cpu": 8,
                    "chip.filter.cpu": 8,
                    "chip.bam2ta.cpu": 8,
                    "chip.xcor.cpu": 8,
                    "chip.align_only": True
                }
                parameters.update(exp_parameters)
                s1_input_files = input_files
                s1_tag = exp_id
                # if complete, step1_output will have a list of 2 files, first_ta, and first_ta_xcor
                keep, step1_status, step1_output = wfr_utils.stepper(library, keep,
                                                                     'step1', s1_tag, exp_files,
                                                                     s1_input_files, step1_name,
                                                                     ['chip.first_ta', 'chip.first_ta_xcor'],
                                                                     additional_input={'parameters': parameters},
                                                                     organism=organism)
                if step1_status == 'complete':
                    exp_ta_file = step1_output[0]
                    exp_taxcor_file = step1_output[1]
                    # accumulate files to patch on experiment
                    patch_data = [exp_ta_file, ]
                    complete['patch_opf'].append([exp_id, patch_data])
                    ta.append(exp_ta_file)
                    taxcor.append(exp_taxcor_file)

                    # find the control file if there is a control set found
                    if control_set:
                        try:
                            exp_cnt_ids = [i['experiment'] for i in exp_resp['experiment_relation']
                                           if i['relationship_type'] == 'controlled by']
                            exp_cnt_ids = [i['@id'] for i in exp_cnt_ids]
                        except (KeyError, TypeError):
                            # relation list missing or not in the expected embedded form
                            control_ready = False
                            print('Control Relation has problems for this exp', exp_id)
                            continue
                        if len(exp_cnt_ids) != 1:
                            control_ready = False
                            print('Multiple controls for this exp', exp_id)
                            continue
                        exp_cnt_id = exp_cnt_ids[0]
                        print('controled by set', exp_cnt_id)
                        # have to do a get for the control experiment
                        exp_cnt_resp = [i for i in all_items['experiment_seq'] if i['@id'] == exp_cnt_id][0]
                        cont_file = ''
                        # check opf for control file
                        for opf_case in exp_cnt_resp.get('other_processed_files', []):
                            if opf_case['title'] == 'ENCODE ChIP-Seq Pipeline - Preliminary Files':
                                opf_files = opf_case['files']
                                assert len(opf_files) == 1
                                cont_file = opf_files[0]['@id']
                        # if not in opf, check processed files
                        if not cont_file:
                            pf_list = exp_cnt_resp.get('processed_files', [])
                            if pf_list:
                                assert len(pf_list) == 1
                                cont_file = pf_list[0]['@id']
                        # did we find it, if so, add it to ta_cnt
                        if cont_file:
                            ta_cnt.append(cont_file)
                        else:
                            control_ready = False

                else:
                    # don't patch anything if at least one exp is still missing
                    ready_for_step2 = False
                print('step1')
                print(step1_status, step1_output, control_ready)
        # back to set level
        final_status = set_acc  # start the reporting with acc
        all_completed = False
        # is step0 step1 complete
        if ready_for_step2 and not control_ready:
            final_status += ' waiting for control experiments to finish processing'
        elif ready_for_step2:
            # for control, add tag to set, and files to experiments
            if control:
                complete['add_tag'] = [set_acc, tag]
            # for non controls check for step2
            else:
                # this only works with 2 experiments, if 3, pick best 2, if more, skip for now
                if len(ta) > 3:
                    set_summary += "| skipped - more then 3 experiments in set, can not process at the moment"
                    check.brief_output.append(set_summary)
                    check.full_output['skipped'].append({set_acc: set_summary})
                    continue
                if len(ta) > 2:
                    ta_2 = []
                    taxcor_2 = []
                    print('ExperimentSet has 3 experiments, selecting best 2')
                    ta_2 = wfr_utils.select_best_2(ta, all_files, all_qcs)
                    # xcor does not have qc, use ta indexes to find the correct files
                    for ta_f in ta_2:
                        taxcor_2.append(taxcor[ta.index(ta_f)])
                    ta = ta_2
                    taxcor = taxcor_2
                    # for control files ,also select best2
                    ta_cnt = wfr_utils.select_best_2(ta_cnt, all_files, all_qcs)

                # collect step2 input files
                s2_input_files = {}
                if organism == 'human':
                    org = 'hs'
                    s2_input_files['chip.blacklist'] = '/files-reference/4DNFIZ1TGJZR/'
                    s2_input_files['chip.chrsz'] = '/files-reference/4DNFIZJB62D1/'
                if organism == 'mouse':
                    org = 'mm'
                    s2_input_files['chip.blacklist'] = '/files-reference/4DNFIZ3FBPK8/'
                    s2_input_files['chip.chrsz'] = '/files-reference/4DNFIBP173GC/'

                def rename_chip(input_at_id_list):
                    # rename bed.gz to tagAlign.gz
                    renamed = []
                    for a_file in input_at_id_list:
                        acc = a_file.split('/')[2]
                        renamed.append(acc + '.tagAlign.gz')
                    return renamed

                s2_input_files['additional_file_parameters'] = {}
                s2_input_files['chip.tas'] = ta
                s2_input_files['additional_file_parameters']['chip.tas'] = {"rename": rename_chip(ta)}
                s2_input_files['chip.bam2ta_no_filt_R1.ta'] = taxcor
                s2_input_files['additional_file_parameters']['chip.bam2ta_no_filt_R1.ta'] = {"rename": rename_chip(taxcor)}
                if ta_cnt:
                    s2_input_files['chip.ctl_tas'] = ta_cnt
                    s2_input_files['additional_file_parameters']['chip.ctl_tas'] = {"rename": rename_chip(ta_cnt)}

                # collect parameters
                parameters = {}
                if paired == 'single':
                    chip_p = False
                elif paired == 'paired':
                    chip_p = True
                if not control_set:
                    if target_type == 'histone':
                        set_summary += "| skipped - histone without control needs attention, ie change to tf"
                        check.brief_output.append(set_summary)
                        check.full_output['skipped'].append({set_acc: set_summary})
                        continue
                run_ids = {'desc': set_acc + a_set.get('description', '')}
                parameters = {
                    "chip.pipeline_type": target_type,
                    "chip.paired_end": chip_p,
                    "chip.choose_ctl.always_use_pooled_ctl": True,
                    "chip.qc_report.desc": run_ids['desc'],
                    "chip.gensz": org,
                    "chip.xcor.cpu": 4,
                }
                if paired == 'single':
                    frag_temp = [300]
                    fraglist = frag_temp * len(ta)
                    parameters['chip.fraglen'] = fraglist

                # if the target is a tf and there is no control, use macs2
                if not control_set:
                    if target_type == 'tf':
                        parameters['chip.peak_caller'] = "macs2"

                s2_tag = set_acc
                # if complete, step2_output will have a list of 3 files: optimal peak, conservative peak, sig_fc
                keep, step2_status, step2_output = wfr_utils.stepper(library, keep,
                                                                     'step2', s2_tag, ta,
                                                                     s2_input_files, step2_name,
                                                                     ['chip.optimal_peak', 'chip.conservative_peak', 'chip.sig_fc'],
                                                                     additional_input={'parameters': parameters},
                                                                     organism=organism)
                if step2_status == 'complete':
                    set_opt_peak = step2_output[0]
                    set_cons_peak = step2_output[1]
                    set_sig_fc = step2_output[2]
                    # accumulate files to patch on experiment
                    patch_data = [set_opt_peak, set_cons_peak, set_sig_fc]
                    complete['patch_opf'].append([set_acc, patch_data])
                    complete['add_tag'] = [set_acc, tag]
                    all_completed = True
        # unpack results
        missing_run = keep['missing_run']
        running = keep['running']
        problematic_run = keep['problematic_run']
        if all_completed:
            final_status += ' completed'
        else:
            if missing_run:
                final_status += ' |Missing: ' + " ".join([i[0] for i in missing_run])
            if running:
                final_status += ' |Running: ' + " ".join([i[0] for i in running])
            if problematic_run:
                final_status += ' |Problem: ' + " ".join([i[0] for i in problematic_run])

        # add dictionaries to main ones
        check.brief_output.append(final_status)
        print(final_status)
        if running:
            check.full_output['running_runs'].append({set_acc: running})
        if missing_run:
            check.full_output['needs_runs'].append({set_acc: missing_run})
        if problematic_run:
            check.full_output['problematic_runs'].append({set_acc: problematic_run})
        # if made it till the end
        if complete.get('add_tag'):
            assert not running
            assert not problematic_run
            assert not missing_run
            check.full_output['completed_runs'].append(complete)
    # complete check values
    check.summary = ""
    if check.full_output['running_runs']:
        check.summary = str(len(check.full_output['running_runs'])) + ' running|'
    if check.full_output['skipped']:
        check.summary += str(len(check.full_output['skipped'])) + ' skipped|'
        check.status = 'WARN'
    if check.full_output['needs_runs']:
        check.summary += str(len(check.full_output['needs_runs'])) + ' missing|'
        check.status = 'WARN'
        check.allow_action = True
    if check.full_output['completed_runs']:
        check.summary += str(len(check.full_output['completed_runs'])) + ' completed|'
        check.status = 'WARN'
        check.allow_action = True
    if check.full_output['problematic_runs']:
        check.summary += str(len(check.full_output['problematic_runs'])) + ' problem|'
        check.status = 'WARN'
    return check
# Persistent JupyterHub state (cookie secret and database) lives in the data volume
data_dir = os.environ['DATA_VOLUME_CONTAINER']
c.JupyterHub.cookie_secret_file = os.path.join(data_dir,
                                               'jupyterhub_cookie_secret')
c.JupyterHub.db_url = os.path.join(data_dir, 'jupyterhub.sqlite')

# Whitelist users and admins
c.Authenticator.whitelist = whitelist = set()
c.Authenticator.admin_users = admin = set()
c.JupyterHub.admin_access = True
# comma-separated admin emails, lowercased.
# Blank entries are dropped: ''.split(',') yields [''], which would otherwise
# make the list truthy and register the API token for an empty user name
# when ADMIN_EMAILS is unset or has stray commas.
admin_emails = [
    email.strip().lower() for email in
    os.environ.get('ADMIN_EMAILS', '').split(',')
    if email.strip()
]
ff_users = ff_utils.search_metadata('search/?type=User&field=email', key=ff_keys)
for ff_user in ff_users:
    if not ff_user.get('email'):
        continue
    whitelist.add(ff_user['email'])
    # base admin off of a set environment variable, for now
    if ff_user['email'].lower() in admin_emails:
        admin.add(ff_user['email'])

# add API token to the instance. Use **only** the first admin email
if admin_emails:
    c.JupyterHub.api_tokens = {
        jh_token['secret']: admin_emails[0],
    }

# set up services
def yellow_flag_biosamples(connection, **kwargs):
    '''
    Checks biosamples for required metadata:
    1. Culture harvest date, doubling number, passage number, culture duration
    2. Morphology image
    3. Karyotyping (authentication doc or string field) for any biosample derived
    from pluripotent cell line that has been passaged more than 10 times beyond
    the first thaw of the original vial.
    4. Differentiation authentication for differentiated cells.
    5. HAP-1 biosamples must have ploidy authentication.
    6. For phase 2 samples must include FBS info (post 2022-05-10)

    Biosamples whose status is in REV are only listed in the brief output;
    all others are compared against the existing
    'biosample-metadata-incomplete' badges to work out which badges need to
    be added, removed or edited.

    A biosample without `biosource_summary` is treated as having an empty
    summary (no tier, not HAP-1) instead of raising.

    Returns the CheckResult: WARN with the action enabled if any badge needs
    patching, else PASS.
    '''
    check = CheckResult(connection, 'yellow_flag_biosamples')

    results = ff_utils.search_metadata('search/?type=Biosample', key=connection.ff_keys)
    flagged = {}
    check.brief_output = {RELEASED_KEY: {}, REV_KEY: []}
    fbs_chk_date = '2022-05-10'
    valid_fbs = ["VWR 97068-091 Lot 035B15 (phase 1)",
                 "Peak Serum PS-FBS2 Lot 21E1202 (phase 2)",
                 "VWR 89510-184 lot 310B19 (phase 2)"]

    for result in results:
        messages = []
        bs_types = [bs.get('biosource_type') for bs in result.get('biosource', [])]
        # guard against a missing/null summary: re.search and `in` need a string
        biosource_summary = result.get('biosource_summary') or ''
        karyotype = False
        diff_auth = False
        ploidy = False
        bccs = result.get('cell_culture_details', [])
        if not bccs:
            # culture details are only optional for these biosource types
            if len([t for t in bs_types if t in ['primary cell', 'tissue', 'multicellular organism']]) != len(bs_types):
                messages.append('Biosample missing Cell Culture Details')
        else:
            tier = re.search(r'\(Tier (1|2)\)', biosource_summary)
            for bcc in bccs:
                for item in [
                    'culture_harvest_date', 'doubling_number', 'passage_number', 'culture_duration', 'morphology_image'
                ]:
                    if not bcc.get(item):
                        messages.append('Biosample missing {}'.format(item))
                if bcc.get('karyotype'):
                    karyotype = True
                for protocol in bcc.get('authentication_protocols', []):
                    protocol_item = ff_utils.get_metadata(protocol['@id'], key=connection.ff_keys)
                    auth_type = protocol_item.get('protocol_classification')
                    if not karyotype and auth_type == 'Karyotype Authentication':
                        karyotype = True
                    elif auth_type == 'Differentiation Authentication':
                        diff_auth = True
                    elif auth_type == 'Ploidy Authentication':
                        ploidy = True
                passages = bcc.get('passage_number', 0)
                # 'tem cell' matches the biosource type regardless of the case of the leading S
                if 'tem cell' in ''.join(bs_types) and not karyotype:
                    if passages > 10:
                        messages.append('Biosample is a stem cell line over 10 passages but missing karyotype')
                    elif not passages:
                        messages.append('Biosample is a stem cell line with unknown passage number missing karyotype')
                # dates are ISO formatted strings, so string comparison orders them correctly
                if tier and bcc.get('culture_start_date', '2000-01-01') > fbs_chk_date:
                    fbs_info = bcc.get('fbs_vendor_lot', '').strip()
                    if fbs_info not in valid_fbs:
                        messages.append('Tiered cell line cultured after {} missing 4DN specified FBS vendor and lot info'.format(fbs_chk_date))
        if result.get('biosample_type') == 'In vitro differentiated cells' and not diff_auth:
            messages.append('Differentiated biosample missing differentiation authentication')
        if 'HAP-1' in biosource_summary and not ploidy:
            messages.append('HAP-1 biosample missing ploidy authentication')
        if messages:
            # drop repeated messages, keeping first-seen order
            messages = list(dict.fromkeys(messages))
            if result.get('status') in REV:
                # order-preserving de-duplication keeps the brief output deterministic
                missing_parts = dict.fromkeys(item[item.index('missing') + 8:] for item in messages)
                check.brief_output[REV_KEY].append('{} missing {}'.format(
                    result['@id'], ', '.join(missing_parts)
                ))
            else:
                flagged[result['@id']] = messages

    to_add, to_remove, to_edit, ok = compare_badges_and_messages(
        flagged, 'Biosample', 'biosample-metadata-incomplete', connection.ff_keys
    )
    check.action = 'patch_biosample_warning_badges'
    if to_add or to_remove or to_edit:
        check.status = 'WARN'
        check.summary = 'Yellow flag biosample badges need patching'
        check.description = '{} biosamples need warning badges patched'.format(
            len(to_add.values()) + len(to_remove.values()) + len(to_edit.values())
        )
        check.allow_action = True
    else:
        check.status = 'PASS'
        check.summary = 'Yellow flag biosample badges up-to-date'
        check.description = 'No yellow flag biosample badges need patching'
    check.full_output = {'Add badge': to_add,
                         'Remove badge': to_remove,
                         'Keep badge and edit messages': to_edit,
                         'Keep badge (no change)': ok}
    check.brief_output[RELEASED_KEY] = {
        'Add badge': ['{} missing {}'.format(
            k, ', '.join([item[item.index('missing') + 8:] for item in flagged[k]])
        ) for k in to_add.keys()],
        'Remove badge': list(to_remove.keys()),
        'Keep badge and edit messages': ['{} missing {}'.format(
            k, ', '.join([item[item.index('missing') + 8:] for item in flagged[k]])
        ) for k in to_edit.keys()]
    }
    return check
def atacseq_status(connection, **kwargs):
    """
    Report (and prepare the action input for) the processing state of ATAC-seq sets.

    Keyword arguments:
    lab_title -- limit search with a lab i.e. Bing+Ren, UCSD
    start_date -- limit search to files generated since a date formatted YYYY-MM-DD
    run_time -- assume runs beyond run_time are dead
    pick_best_2 -- False by default. If set the True, for sets more than 2 experiments,
                   2 best will be used instead of running mergebed

    For each set returned by the query, the workflow steps are evaluated with
    wfr_utils.stepper: merge-fastq (step0) when an experiment has more than 2
    file groups, alignment per experiment (step1), mergebed on the set when it
    has more than 2 experiments (step2, unless pick_best_2) and
    post-alignment on the set (step3). The outcome is collected in
    check.full_output under 'skipped', 'running_runs', 'needs_runs',
    'completed_runs' and 'problematic_runs'.

    Returns the CheckResult; status is WARN when anything is skipped, missing,
    completed (needs patching) or problematic.
    """
    start = datetime.utcnow()
    check = CheckResult(connection, 'atacseq_status')
    my_auth = connection.ff_keys
    check.action = "atacseq_start"
    check.description = "run missing steps and add processing results to processed files, match set status"
    check.brief_output = []
    check.summary = ""
    check.full_output = {'skipped': [], 'running_runs': [], 'needs_runs': [],
                         'completed_runs': [], 'problematic_runs': []}
    check.status = 'PASS'
    exp_type = 'ATAC-seq'
    # completion tag
    tag = wfr_utils.accepted_versions[exp_type][-1]
    pick_best_2 = kwargs.get('pick_best_2', False)
    # check indexing queue
    check, skip = wfr_utils.check_indexing(check, connection)
    if skip:
        return check
    # Build the query, add date and lab if available
    query = wfr_utils.build_exp_type_query(exp_type, kwargs)
    res = ff_utils.search_metadata(query, key=my_auth)
    print(len(res))
    if not res:
        check.summary = 'All Good!'
        return check
    # run step 0 on all experiments with more than 2 sets of files
    # step1 on each experiment,if multiple exps, merge beds, run step3 on set
    step0_name = 'merge-fastq'
    step1_name = 'encode-atacseq-aln'
    step2_name = 'mergebed'
    step3_name = 'encode-atacseq-postaln'

    for a_set in res:
        set_acc = a_set['accession']
        all_items, all_uuids = ff_utils.expand_es_metadata([a_set['uuid']], my_auth,
                                                           store_frame='embedded',
                                                           add_pc_wfr=True,
                                                           ignore_field=['experiment_relation',
                                                                         'biosample_relation',
                                                                         'references',
                                                                         'reference_pubs'])
        now = datetime.utcnow()
        print(a_set['accession'], (now-start).seconds, len(all_uuids))
        # stop picking up new sets once the time budget is used up
        if (now-start).seconds > lambda_limit:
            break
        # are all files uploaded ?
        all_uploaded = True
        for a_file in all_items['file_fastq']:
            if a_file['status'] in ['uploading', 'upload failed']:
                all_uploaded = False
        if not all_uploaded:
            final_status = a_set['accession'] + ' skipped, waiting for file upload'
            print(final_status)
            check.brief_output.append(final_status)
            check.full_output['skipped'].append({a_set['accession']: 'files status uploading'})
            continue
        all_wfrs = all_items.get('workflow_run_awsem', []) + all_items.get('workflow_run_sbg', [])
        all_files = [i for typ in all_items for i in all_items[typ] if typ.startswith('file_')]
        all_qcs = [i for typ in all_items for i in all_items[typ] if typ.startswith('quality_metric')]
        library = {'wfrs': all_wfrs, 'files': all_files, 'qcs': all_qcs}
        keep = {'missing_run': [], 'running': [], 'problematic_run': []}
        # if all completed, patch this info
        complete = {'patch_opf': [],
                    'add_tag': []}
        # some feature to extract from each set
        paired = ""  # single or paired , checked for each experiment
        organism = ""
        replicate_exps = a_set['replicate_exps']
        replicate_exps = sorted(replicate_exps, key=lambda x: [x['bio_rep_no'], x['tec_rep_no']])
        # get organism from the first replicate
        f_exp = replicate_exps[0]['replicate_exp']['uuid']
        f_exp_resp = [i for i in all_items['experiment_atacseq'] if i['uuid'] == f_exp][0]
        biosample = f_exp_resp['biosample']
        organism = list(set([bs['organism']['name'] for bs in biosample['biosource']]))[0]
        set_summary = " - ".join([set_acc, str(organism)])
        print(set_summary)
        # sanity checks
        # can only process mouse and human at the moment
        if organism not in ['mouse', 'human']:
            set_summary += "| organism not ready for atac"
            check.brief_output.append(set_summary)
            check.full_output['skipped'].append({set_acc: set_summary})
            continue
        # collect results from step1 runs for step2
        ta = []
        # track if all experiments completed step0 and step1
        ready_for_step2 = True
        for an_exp in replicate_exps:
            # track if all experiments completed step0
            ready_for_step1 = True
            exp_id = an_exp['replicate_exp']['accession']
            exp_resp = [i for i in all_items['experiment_atacseq'] if i['accession'] == exp_id][0]
            # exp_files [[pair1,pair2], [pair1, pair2]]
            exp_files, paired = wfr_utils.get_chip_files(exp_resp, all_files)
            # if there are more then 2 files, we need to merge:
            print(exp_id, len(exp_files), paired)
            # if too many input, merge them
            if len(exp_files) > 2:
                # exp_files format [[pair1,pair2], [pair1, pair2]]  @id
                input_list = []
                if paired == 'paired':
                    # first add paired end 1s
                    input_list.append([i[0] for i in exp_files])
                    input_list.append([i[1] for i in exp_files])
                elif paired == 'single':
                    input_list.append([i[0] for i in exp_files])
                # collect files for step1
                merged_files = []
                step0_status = 'complete'
                merge_enum = 0
                # if paired, need to run merge twice for each end
                for merge_case in input_list:
                    merge_enum += 1
                    # RUN STEP 0
                    s0_input_files = {'input_fastqs': merge_case}
                    s0_tag = exp_id + '_p' + str(merge_enum)
                    keep, step0_status, step0_output = wfr_utils.stepper(library, keep,
                                                                         'step0', s0_tag, merge_case,
                                                                         s0_input_files, step0_name, 'merged_fastq')
                    if step0_status == 'complete':
                        merged_files.append(step0_output)
                    else:
                        ready_for_step1 = False

                if ready_for_step1:
                    # rewrite exp_files with merged ones
                    exp_files = [[]]
                    for a_merged in merged_files:
                        exp_files[0].append(a_merged)
            # if step0 was not complete, skip checks for step2
            if not ready_for_step1:
                ready_for_step2 = False
                continue

            # step1 files
            # references
            input_files = {}
            if organism == 'human':
                org = 'hs'
                input_files['atac.bowtie2_idx_tar'] = '/files-reference/4DNFIMQPTYDY/'
                input_files['atac.blacklist'] = '/files-reference/4DNFIZ1TGJZR/'
                input_files['atac.chrsz'] = '/files-reference/4DNFIZJB62D1/'
                input_files['additional_file_parameters'] = {"atac.bowtie2_idx_tar": {"rename": "GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.tar"}}
            if organism == 'mouse':
                org = 'mm'
                input_files['atac.bowtie2_idx_tar'] = '/files-reference/4DNFI2493SDN/'
                input_files['atac.blacklist'] = '/files-reference/4DNFIZ3FBPK8/'
                input_files['atac.chrsz'] = '/files-reference/4DNFIBP173GC/'
                input_files['additional_file_parameters'] = {"atac.bowtie2_idx_tar": {"rename": "mm10_no_alt_analysis_set_ENCODE.fasta.tar"}}
            # add input files
            input_files['atac.fastqs'] = [exp_files]
            # step1 Parameters
            parameters = {
                "atac.pipeline_type": 'atac',
                "atac.gensz": org,
                "atac.bam2ta.regex_grep_v_ta": "chr[MUE]|random|alt",
                "atac.disable_ataqc": True,
                "atac.enable_xcor": False,
                "atac.trim_adapter.auto_detect_adapter": True,
                "atac.bowtie2.cpu": 4,
                "atac.filter.cpu": 4,
                "atac.bam2ta.cpu": 4,
                "atac.trim_adapter.cpu": 4,
                "atac.align_only": True
            }
            if paired == 'single':
                frag_temp = [300]
                fraglist = frag_temp * len(exp_files)
                parameters['atac.fraglen'] = fraglist
                parameters['atac.paired_end'] = False
            elif paired == 'paired':
                parameters['atac.paired_end'] = True

            s1_input_files = input_files
            s1_tag = exp_id
            # if complete, step1_output will be the first_ta file
            keep, step1_status, step1_output = wfr_utils.stepper(library, keep,
                                                                 'step1', s1_tag, exp_files,
                                                                 s1_input_files, step1_name, 'atac.first_ta',
                                                                 additional_input={'parameters': parameters})
            if step1_status == 'complete':
                # accumulate files to patch on experiment
                patch_data = [step1_output, ]
                complete['patch_opf'].append([exp_id, patch_data])
                ta.append(step1_output)
            else:
                # don't patch anything if at least one exp is still missing
                ready_for_step2 = False
            print('step1', step1_status, step1_output)
        # back to set level
        final_status = set_acc  # start the reporting with acc
        all_completed = False
        # is step0 step1 complete
        if ready_for_step2:
            # Following was the proposed logic, but it is not implemented
            # Currently, for sets with more than 2 experiments, there are 2 options
            # 1) pick best 2, 2) run mergebed (default)

            # Proposed logic
            # if there are more then 2 experiments, check the number of biological replicates
            # if there is 1 Biological Replicate
            #    -pick best 2 exp
            # if there are 2 Biological replicates
            #    - run mergebed on bioreps with more then 1 technical replicate
            # if there are 3 Biological replicates
            #    - if there are 3 total experiments (1 in each biological rep), pick best 2
            #    - else, run mergebed on bioreps with more then 1 technical replicate, and pick best 2 biorep
            # if there are 4 or more Biological replicates
            #    - run mergebed on bioreps with more then 1 technical replicate, and pick best 2 biorep

            # this only works with 2 experiments, if 3, pick best 2, if more, skip for now
            ready_for_step3 = True
            if len(ta) > 2:
                if pick_best_2:
                    # pick best 2 - False by default
                    print('ExperimentSet has 3 experiments, selecting best 2')
                    ta = wfr_utils.select_best_2(ta, all_files, all_qcs)
                else:
                    # run mergebed - default option
                    s2_input_files = {'input_bed': ta}
                    s2_tag = set_acc
                    # if complete, step2_output will be the merged bed file
                    keep, step2_status, step2_output = wfr_utils.stepper(library, keep,
                                                                         'step2', s2_tag, ta,
                                                                         s2_input_files, step2_name, 'merged_bed')
                    if step2_status == 'complete':
                        ta = [step2_output, ]
                    else:
                        ready_for_step3 = False

            if ready_for_step3:
                # collect step3 input files
                s3_input_files = {}
                if organism == 'human':
                    org = 'hs'
                    s3_input_files['atac.blacklist'] = '/files-reference/4DNFIZ1TGJZR/'
                    s3_input_files['atac.chrsz'] = '/files-reference/4DNFIZJB62D1/'
                if organism == 'mouse':
                    org = 'mm'
                    s3_input_files['atac.blacklist'] = '/files-reference/4DNFIZ3FBPK8/'
                    s3_input_files['atac.chrsz'] = '/files-reference/4DNFIBP173GC/'

                def rename_chip(input_at_id_list):
                    # rename bed.gz to tagAlign.gz
                    renamed = []
                    for a_file in input_at_id_list:
                        acc = a_file.split('/')[2]
                        renamed.append(acc + '.tagAlign.gz')
                    return renamed

                s3_input_files['additional_file_parameters'] = {}
                s3_input_files['atac.tas'] = ta
                # the rename entry must be keyed by the same argument name as the
                # input it applies to ('atac.tas'); it used to be keyed 'chip.tas',
                # which matches no input of this workflow
                s3_input_files['additional_file_parameters']['atac.tas'] = {"rename": rename_chip(ta)}
                # collect parameters
                if paired == 'single':
                    chip_p = False
                elif paired == 'paired':
                    chip_p = True
                parameters = {
                    "atac.pipeline_type": 'atac',
                    "atac.paired_end": chip_p,
                    "atac.gensz": org,
                    "atac.disable_ataqc": True,
                    "atac.enable_xcor": False,
                }
                if paired == 'single':
                    frag_temp = [300]
                    fraglist = frag_temp * len(ta)
                    parameters['atac.fraglen'] = fraglist

                s3_tag = set_acc
                # if complete, step3_output will have a list of 3 files: optimal peak, conservative peak, sig_fc
                keep, step3_status, step3_output = wfr_utils.stepper(library, keep,
                                                                     'step3', s3_tag, ta,
                                                                     s3_input_files, step3_name,
                                                                     ['atac.optimal_peak', 'atac.conservative_peak', 'atac.sig_fc'],
                                                                     additional_input={'parameters': parameters})
                if step3_status == 'complete':
                    set_opt_peak = step3_output[0]
                    set_cons_peak = step3_output[1]
                    set_sig_fc = step3_output[2]
                    # accumulate files to patch on experiment
                    patch_data = [set_opt_peak, set_cons_peak, set_sig_fc]
                    complete['patch_opf'].append([set_acc, patch_data])
                    complete['add_tag'] = [set_acc, tag]
                    all_completed = True

        # unpack results
        missing_run = keep['missing_run']
        running = keep['running']
        problematic_run = keep['problematic_run']
        if all_completed:
            final_status += ' completed'
        else:
            if missing_run:
                final_status += ' |Missing: ' + " ".join([i[0] for i in missing_run])
            if running:
                final_status += ' |Running: ' + " ".join([i[0] for i in running])
            if problematic_run:
                final_status += ' |Problem: ' + " ".join([i[0] for i in problematic_run])

        # add dictionaries to main ones
        check.brief_output.append(final_status)
        print(final_status)
        if running:
            check.full_output['running_runs'].append({set_acc: running})
        if missing_run:
            check.full_output['needs_runs'].append({set_acc: missing_run})
        if problematic_run:
            check.full_output['problematic_runs'].append({set_acc: problematic_run})
        # if made it till the end
        if complete.get('add_tag'):
            assert not running
            assert not problematic_run
            assert not missing_run
            check.full_output['completed_runs'].append(complete)
    # complete check values
    check.summary = ""
    if check.full_output['running_runs']:
        check.summary = str(len(check.full_output['running_runs'])) + ' running|'
    if check.full_output['skipped']:
        check.summary += str(len(check.full_output['skipped'])) + ' skipped|'
        check.status = 'WARN'
    if check.full_output['needs_runs']:
        check.summary += str(len(check.full_output['needs_runs'])) + ' missing|'
        check.status = 'WARN'
        check.allow_action = True
    if check.full_output['completed_runs']:
        check.summary += str(len(check.full_output['completed_runs'])) + ' completed|'
        check.status = 'WARN'
        check.allow_action = True
    if check.full_output['problematic_runs']:
        check.summary += str(len(check.full_output['problematic_runs'])) + ' problem|'
        check.status = 'WARN'
    return check
def repsets_have_bio_reps(connection, **kwargs):
    '''
    Audit replicate experiment sets for replicate numbering problems.

    Each ExperimentSetReplicate returned by the search is inspected for:
    1) only one biological replicate (a set in this situation that carries the
       'many_replicates' tag is skipped entirely)
    2) biological replicate numbers that do not form a consecutive run
    3) technical replicate numbers, within one biological replicate, that do
       not form a consecutive run

    Findings are grouped in the brief output by whether the set's status is in REV.
    Only sets whose status is not in REV are passed to compare_badges_and_messages
    to work out which 'replicate-numbers' badges to add, remove or edit.

    connection: handed to CheckResult; its ff_keys are used for the queries
    kwargs: accepted but not used by this check

    return the CheckResult, whose action is 'patch_badges_for_replicate_numbers'
    '''
    def _is_consecutive(ordered_nums):
        # True when the sorted numbers equal the full integer run from min to max
        return ordered_nums == list(range(min(ordered_nums), max(ordered_nums) + 1))

    check = CheckResult(connection, 'repsets_have_bio_reps')

    results = ff_utils.search_metadata(
        'search/?type=ExperimentSetReplicate&frame=object',
        key=connection.ff_keys,
        page_limit=50,
    )

    audits = {
        REV_KEY: {'single_biorep': [], 'biorep_nums': [], 'techrep_nums': []},
        RELEASED_KEY: {'single_biorep': [], 'biorep_nums': [], 'techrep_nums': []}
    }
    by_exp = {}
    for repset in results:
        # map each biological replicate number to its technical replicate numbers
        bio_to_tech = {}
        for rep in repset.get('replicate_exps') or []:
            bio_to_tech.setdefault(rep['bio_rep_no'], []).append(rep['tec_rep_no'])
        if not bio_to_tech:
            continue

        in_review = repset.get('status') in REV
        bucket = audits[REV_KEY if in_review else RELEASED_KEY]
        messages = []
        bio_nums = sorted(bio_to_tech)

        # a lone biological replicate
        if len(bio_nums) == 1:
            # the tag marks sets that have many replicates of which only one is
            # present in the database (typically imaging datasets): false positive
            if 'many_replicates' in repset.get('tags', []):
                continue
            bucket['single_biorep'].append(repset['@id'])
            messages.append('Replicate set contains only a single biological replicate')

        # gaps in the biological replicate numbering
        if not _is_consecutive(bio_nums):
            bucket['biorep_nums'].append(
                '{} - bio rep #s: {}'.format(repset['@id'], bio_nums)
            )
            messages.append('Biological replicate numbers are not in sequence')

        # gaps in the technical replicate numbering of each biological replicate
        for bio_no, tech_nos in bio_to_tech.items():
            tech_sorted = sorted(tech_nos)
            if _is_consecutive(tech_sorted):
                continue
            bucket['techrep_nums'].append(
                '{} - tech rep #s of biorep {}: {}'.format(repset['@id'], bio_no, tech_sorted)
            )
            messages.append(
                'Technical replicate numbers of biological replicate {} are not in sequence'.format(bio_no)
            )

        # only sets outside the REV statuses are candidates for badges
        if messages and not in_review:
            by_exp[repset['@id']] = sorted(messages)

    to_add, to_remove, to_edit, ok = compare_badges_and_messages(
        by_exp, 'ExperimentSetReplicate', 'replicate-numbers', connection.ff_keys
    )
    check.action = 'patch_badges_for_replicate_numbers'
    patch_count = len(to_add) + len(to_remove) + len(to_edit)
    if patch_count:
        check.status = 'WARN'
        check.summary = 'Replicate number badges need patching'
        check.description = '{} replicate experiment sets need replicate badges patched'.format(patch_count)
        check.allow_action = True
    else:
        check.status = 'PASS'
        check.summary = 'Replicate number badges up-to-date'
        check.description = 'No replicate number badges need patching'

    full_output = {
        'Add badge': to_add,
        'Remove badge': to_remove,
        'Keep badge and edit messages': to_edit,
        'Keep badge (no change)': len(ok),
    }
    check.full_output = full_output

    # regroup the non-REV audit entries by the badge action they ended up in
    released_brief = {
        group: {'single_biorep': [], 'biorep_nums': [], 'techrep_nums': []}
        for group in full_output
    }
    changed_groups = ("Add badge", 'Remove badge', 'Keep badge and edit messages')
    for issue, entries in audits[RELEASED_KEY].items():
        unchanged = 0
        for entry in entries:
            # every audit entry starts with the set's @id, followed by a space at most
            set_id = entry.split(' ')[0]
            for group in changed_groups:
                if set_id in full_output[group]:
                    released_brief[group][issue].append(entry)
            if set_id in ok:
                unchanged += 1
        # unchanged badges are reported as a count per issue, not as a list
        released_brief['Keep badge (no change)'][issue] = unchanged

    check.brief_output = {REV_KEY: audits[REV_KEY], RELEASED_KEY: released_brief}
    return check