Python search_metadata 예제들, dcicutils.ff_utils.search_metadata Python 예제들

예제 #1

0

파일 보기

파일: generate_items_from_owl.py 프로젝트: dbmi-bgm/cgap-portal

def get_existing_items_from_db(connection, itype, include_invisible=True):
    """ Yield every existing item of type ``itype`` found by searching the db.

        A first search is made for the plain type query.  When
        ``include_invisible`` is true, a second search is made for the same
        type restricted to the 'deleted' and 'replaced' statuses, and its
        results are yielded after those of the first search.

        connection: passed through to search_metadata
        itype: item type name inserted into the search query
        include_invisible: when False only the plain type search is run
        return: generator of the items produced by the search(es)
    """
    hidden_statuses = ('deleted', 'replaced')
    base_query = 'search/?type={}'.format(itype)
    queries = [base_query]
    if include_invisible:
        status_filter = ''.join(
            '&status={}'.format(status) for status in hidden_statuses)
        queries.append(base_query + status_filter)
    # search_metadata is called for every query before the first item is yielded
    result_streams = [
        search_metadata(query, connection, page_limit=200, is_generator=True)
        for query in queries
    ]
    for stream in result_streams:
        yield from stream

예제 #2

0

파일 보기

파일: badge_checks.py 프로젝트: 4dn-dcic/foursight

def exp_has_raw_files(connection, **kwargs):
    '''
    Check for sequencing experiments that don't have raw files.

    Two searches are combined: experiments (excluding ExperimentMic) with no
    files at all, and experiments linked from FileFastq items whose status is
    uploading/archived/deleted/upload failed.  Experiments of the second kind
    are fetched individually and count as missing raw files unless at least
    one of their files is either not in the bad-status set or shares the
    experiment's own status.

    Experiments whose status is in REV are reported separately from the rest;
    only the latter are compared against the existing 'no-raw-files' badges.
    Action patches badges.

    Returns the populated CheckResult.
    '''
    check = CheckResult(connection, 'exp_has_raw_files')
    # search all experiments except microscopy experiments for missing files field
    no_files = ff_utils.search_metadata('search/?type=Experiment&%40type%21=ExperimentMic&files.uuid=No+value',
                                        key=connection.ff_keys)
    # also check sequencing experiments whose files items are all uploading/archived/deleted
    bad_status = ff_utils.search_metadata('search/?status=uploading&status=archived&status=deleted&status=upload+failed'
                                          '&type=FileFastq&experiments.uuid%21=No+value',
                                          key=connection.ff_keys)
    bad_status_ids = {item['@id']: item['status'] for item in bad_status}
    # `or []` makes a fastq without an 'experiments' value contribute nothing
    # instead of raising TypeError when None is iterated
    exps = list({exp['@id'] for fastq in bad_status
                 for exp in fastq.get('experiments') or []})
    missing_files_released = [e['@id'] for e in no_files if e.get('status') not in REV]
    missing_files_in_rev = [e['@id'] for e in no_files if e.get('status') in REV]
    for expt in exps:
        result = ff_utils.get_metadata(expt, key=connection.ff_keys)
        raw_files = False
        for fastq in result.get('files') or []:
            # a file counts as a usable raw file when it is not in the bad-status
            # set, or when its bad status is the same as the experiment's status
            if fastq['@id'] not in bad_status_ids or result['status'] == bad_status_ids[fastq['@id']]:
                raw_files = True
                break
        if not raw_files:
            if result.get('status') in REV:
                missing_files_in_rev.append(expt)
            else:
                missing_files_released.append(expt)

    to_add, to_remove, ok = compare_badges(missing_files_released, 'Experiment', 'no-raw-files', connection.ff_keys)

    if to_add or to_remove:
        check.status = 'WARN'
        check.summary = 'Raw Files badges need patching'
        check.description = '{} sequencing experiments need raw files badges patched'.format(
            len(to_add) + len(to_remove)
        )
        check.allow_action = True
    else:
        check.status = 'PASS'
        check.summary = 'Raw Files badges up-to-date'
        check.description = 'No sequencing experiments need raw files badges patched'
    check.action = 'patch_badges_for_raw_files'
    check.full_output = {'Add badge': to_add,
                         'Remove badge': to_remove,
                         'Keep badge': ok}
    check.brief_output = {REV_KEY: missing_files_in_rev,
                          RELEASED_KEY: {'Add badge': to_add, 'Remove badge': to_remove}}
    return check

예제 #3

0

파일 보기

def check_validation_errors(connection, **kwargs):
    '''
    Search for items that have a validation_errors.name value and report them.

    When any are found the check is set to WARN, the description lists the
    number of items and the distinct '@type' entries seen (other than 'Item'),
    and ff_link points at the search.  Otherwise the check is set to PASS.
    Returns the CheckResult.
    '''
    check = CheckResult(connection, 'check_validation_errors')

    search_url = 'search/?validation_errors.name!=No+value&type=Item'
    results = ff_utils.search_metadata(search_url + '&field=@id',
                                       key=connection.ff_keys)
    if not results:
        check.status = 'PASS'
        check.summary = 'No validation errors'
        check.description = 'No validation errors found.'
        return check

    # collect every type name except the generic 'Item'
    type_names = set()
    for hit in results:
        type_names.update(name for name in hit['@type'] if name != 'Item')

    check.status = 'WARN'
    check.summary = 'Validation errors found'
    check.description = (
        '{} items found with validation errors, comprising the following '
        'item types: {}. \nFor search results see link below.'.format(
            len(results), ', '.join(type_names)))
    check.ff_link = connection.ff_server + search_url
    return check

예제 #4

0

파일 보기

def page_children_routes(connection, **kwargs):
    '''
    Check that each child page's name is '<parent name>/<last path segment>'.

    Searches Page items that have children, skips the page named
    'resources/data-collections', and records for every other page the
    children whose name is not a direct sub-route of the parent's name.
    full_output maps parent page name -> list of offending child names.
    Returns the CheckResult.
    '''
    check = CheckResult(connection, 'page_children_routes')

    page_search = 'search/?type=Page&format=json&children.name%21=No+value'
    pages = ff_utils.search_metadata(page_search, key=connection.ff_keys)
    problem_routes = {}
    for page in pages:
        parent_name = page['name']
        if parent_name == 'resources/data-collections':
            continue
        bad_children = []
        for child in page['children']:
            child_name = child['name']
            expected_name = parent_name + '/' + child_name.split('/')[-1]
            if child_name != expected_name:
                bad_children.append(child_name)
        if bad_children:
            problem_routes[parent_name] = bad_children

    if problem_routes:
        bad_count = sum(len(names) for names in problem_routes.values())
        check.status = 'WARN'
        check.summary = 'Pages with bad routes found'
        check.description = (
            '{} child pages whose route is not a direct sub-route of parent'
            ''.format(bad_count))
    else:
        check.status = 'PASS'
        check.summary = 'No pages with bad routes'
        check.description = 'All routes of child pages are a direct sub-route of parent page'
    check.full_output = problem_routes
    return check

예제 #5

0

파일 보기

파일: badge_checks.py 프로젝트: 4dn-dcic/foursight

def compare_badges_and_messages(obj_id_dict, item_type, badge, ff_keys):
    '''
    Compares items that should have a given badge to items that do have the given badge.
    Also compares badge messages to see if the message is the right one or needs to be updated.

    Args:
        obj_id_dict: dictionary of item @id -> the badge messages it should have.
        item_type: item type used in the search query.
        badge: badge name; used in the search query and to find the matching
            entry in each item's 'badges' list.
        ff_keys: passed to ff_utils.search_metadata as ``key``.

    Returns:
        (needs_badge, remove_badge, badge_edit, badge_ok) where
        needs_badge is {@id: messages} for expected items not already handled,
        remove_badge is {@id: remaining badges} for items that have the badge
        but should not, badge_edit is {@id: updated badges} for items whose
        messages differ, and badge_ok is a list of @ids already correct.
    '''
    search_url = 'search/?type={}&badges.badge.@id=/badges/{}/'.format(item_type, badge)
    has_badge = ff_utils.search_metadata(search_url + '&frame=object', key=ff_keys)
    badge_edit = {}
    badge_ok = []
    remove_badge = {}
    for item in has_badge:
        if item['@id'] in obj_id_dict:
            # handle differences in badge messages
            for a_badge in item['badges']:
                if a_badge['badge'].endswith(badge + '/'):
                    if a_badge.get('messages') == obj_id_dict[item['@id']]:
                        badge_ok.append(item['@id'])
                    else:
                        # replace a 'message' value with the expected 'messages'
                        if a_badge.get('message'):
                            del a_badge['message']
                        a_badge['messages'] = obj_id_dict[item['@id']]
                        badge_edit[item['@id']] = item['badges']
                    break
        else:
            # item has the badge but is not expected to: drop that badge entry
            this_badge = [a_badge for a_badge in item['badges'] if badge in a_badge['badge']][0]
            item['badges'].remove(this_badge)
            remove_badge[item['@id']] = item['badges']
    # build the lookup once instead of re-concatenating lists for every key
    already_handled = set(badge_ok) | set(badge_edit)
    needs_badge = {key: val for key, val in obj_id_dict.items()
                   if key not in already_handled}
    return needs_badge, remove_badge, badge_edit, badge_ok

예제 #6

0

파일 보기

파일: file_format.py 프로젝트: 4dn-dcic/tibanna_ff

 def __init__(self, ff_keys=None, ffe_all=None):
     """Build ``self.fe_dict`` from FileFormat search results.

     Exactly one of the two arguments must be given:

     ff_keys: keys used to search the server for all FileFormat objects
         (``ff_keys['server']`` is logged).
     ffe_all: user-specified iterable of FileFormat dicts, used as is.

     Each entry must provide 'file_format' and 'standard_file_extension';
     'other_allowed_extensions' and 'extrafile_formats' default to [].

     Raises Exception when neither or both arguments are given, or when
     the FileFormat search fails (chained to the original error).
     """
     if not ff_keys and not ffe_all:
         raise Exception("Either ff_keys or ffe_all must be specified "
                         "to create a FormatExtensionMap object")
     if ff_keys and ffe_all:
         raise Exception("Either ff_keys or ffe_all must be specified but not both "
                         "to create a FormatExtensionMap object")
     if ff_keys:
         # ffe_all is necessarily falsy here because of the checks above
         try:
             logger.debug("Searching in server : " + ff_keys['server'])
             ffe_all = search_metadata(
                 "/search/?type=FileFormat&frame=object", key=ff_keys)
         except Exception as e:
             raise Exception(
                 "Can't get the list of FileFormat objects. %s\n" % e) from e
     self.fe_dict = dict()
     logger.debug("**ffe_all = " + str(ffe_all))
     for k in ffe_all:
         file_format = k['file_format']
         self.fe_dict[file_format] = {
             'standard_extension': k['standard_file_extension'],
             'other_allowed_extensions': k.get('other_allowed_extensions', []),
             'extrafile_formats': k.get('extrafile_formats', []),
         }

예제 #7

0

파일 보기

파일: parse_hpoa.py 프로젝트: dbmi-bgm/cgap-portal

def compare_existing_to_newly_generated(logger, connection, evidence_items,
                                        itype):
    """ Compare non-obsolete db items of ``itype`` with newly generated ones.

        Every db item is converted with convert2raw; if the converted item is
        present in ``evidence_items`` it is removed from that list (in place)
        and counted as existing, otherwise its uuid is collected.

        Returns a tuple of (remaining evidence_items, number of existing
        items, list of uuids of db items that had no match).
    """
    def _timestamp():
        return datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")

    query = 'search/?type={}&status!=obsolete'.format(itype)
    logger.info("COMPARING FILE ITEMS WITH CURRENT DB CONTENT")
    logger.info("searching: {}".format(_timestamp()))
    db_items = search_metadata(query,
                               connection,
                               is_generator=True,
                               page_limit=500)
    matched = 0
    unmatched_uuids = []
    logger.info("comparing: {}".format(_timestamp()))
    for db_item in db_items:
        raw_item = convert2raw(db_item)
        if raw_item not in evidence_items:
            unmatched_uuids.append(db_item.get('uuid'))
            continue
        matched += 1
        evidence_items.remove(raw_item)
    logger.info("result: {}".format(_timestamp()))
    return evidence_items, matched, unmatched_uuids

예제 #8

0

파일 보기

파일: script_utils.py 프로젝트: aschroed/pyscripts

def safe_search_with_callback(fdn_conn,
                              query,
                              container,
                              callback,
                              limit=20,
                              frame='embedded'):
    """
    Run a search in pages of ``limit`` results and feed each hit to a callback.

    ``query`` must not contain 'limit' or 'from' parameters; they are appended
    here for each page.  ``callback`` is called as callback(hit, container)
    for every hit.  Paging stops when a page comes back empty or with a
    number of hits different from ``limit``.  Nothing is returned; results
    are expected to be accumulated in ``container`` by the callback.
    """
    offset = 0
    while True:
        print('...', offset)
        page_query = query + '&from=' + str(offset) + '&limit=' + str(limit)
        page = search_metadata(page_query,
                               connection=fdn_conn,
                               frame=frame)
        if not page:  # 0 results
            return
        page_size = len(page)
        offset += page_size
        for hit in page:
            callback(hit, container)
        if page_size != limit:
            # a page that is not exactly full is treated as the last one
            return
예제 #9

0

파일 보기

def experiment_set_reporting_data(connection, **kwargs):
    """
    Collect a report over replicate experiment sets.

    Runs the replicate-set search twice (once as is, once restricted to
    status deleted/replaced), passes every hit to add_to_report together
    with a shared dict, and stores that dict as the check's full_output.
    The check status is always 'IGNORE'.  Returns the CheckResult.
    """
    check = CheckResult(connection, 'experiment_set_reporting_data')
    check.status = 'IGNORE'
    report = {}
    base_query = '/search/?type=ExperimentSetReplicate&experimentset_type=replicate&sort=-date_created'
    all_hits = ff_utils.search_metadata(base_query, key=connection.ff_keys, page_limit=20)
    # deleted and replaced sets need their own search
    hidden_hits = ff_utils.search_metadata(base_query + '&status=deleted&status=replaced',
                                           key=connection.ff_keys, page_limit=20)
    all_hits.extend(hidden_hits)
    for exp_set in all_hits:
        add_to_report(exp_set, report)
    check.full_output = report
    return check

예제 #10

0

파일 보기

파일: generate_ontology.py 프로젝트: hms-dbmi/encode

def get_existing_ontology_terms(connection, ontologies=None):
    '''Search the db for OntologyTerm items and index them by term_id.

    When ``ontologies`` is given, the search is filtered by the uuid of each
    ontology via source_ontology.uuid parameters.
    Returns a dict of term_id -> term.
    '''
    uuid_filters = ''
    if ontologies is not None:
        uuid_filters = ''.join(
            '&source_ontology.uuid={}'.format(ontology.get('uuid'))
            for ontology in ontologies)
    query = 'search/?type=OntologyTerm' + uuid_filters
    terms = search_metadata(query, connection, page_limit=200, is_generator=True)
    terms_by_id = {}
    for term in terms:
        terms_by_id[term['term_id']] = term
    return terms_by_id

예제 #11

0

파일 보기

def get_item_ids_from_args(id_input, auth, is_search=False):
    '''Return a list of item ids derived from ``id_input``.

    - is_search: id_input[0] is a search query string; the uuids of the
      search results are returned.
    - otherwise id_input[0] is tried as a file path and its stripped lines
      are returned; if no such file exists, id_input itself is returned.
    '''
    if not is_search:
        try:
            with open(id_input[0]) as id_file:
                return [line.strip() for line in id_file]  # pragma: no cover
        except FileNotFoundError:
            return id_input
    hits = search_metadata('search/?' + id_input[0], auth, is_generator=True)
    return [hit.get('uuid') for hit in hits]

예제 #12

0

파일 보기

def paired_end_info_consistent(connection, **kwargs):
    '''
    Check that fastq files carry consistent paired-end information.

    Two searches are run: files with a "paired with" related file but no
    paired_end value, and files with a paired_end value but no "paired with"
    related file.  The @ids found by each are stored in full_output under a
    descriptive key, and all @ids together in brief_output.  The check is
    WARN when either search finds anything, PASS otherwise.
    '''
    check = CheckResult(connection, 'paired_end_info_consistent')

    search1 = 'search/?type=FileFastq&file_format.file_format=fastq&related_files.relationship_type=paired+with&paired_end=No+value'
    search2 = 'search/?type=FileFastq&file_format.file_format=fastq&related_files.relationship_type!=paired+with&paired_end%21=No+value'

    results1 = ff_utils.search_metadata(search1 + '&frame=object',
                                        key=connection.ff_keys)
    results2 = ff_utils.search_metadata(search2 + '&frame=object',
                                        key=connection.ff_keys)

    missing_number = [hit['@id'] for hit in results1]
    missing_related = [hit['@id'] for hit in results2]
    results = {
        'paired with file missing paired_end number': missing_number,
        'file with paired_end number missing "paired with" related_file': missing_related
    }

    if missing_number or missing_related:
        check.status = 'WARN'
        check.summary = 'Inconsistencies found in FileFastq paired end info'
        check.description = (
            '{} files found with a "paired with" related_file but missing a paired_end number; '
            '{} files found with a paired_end number but missing related_file info'
            ''.format(len(missing_number), len(missing_related)))
    else:
        check.status = 'PASS'
        check.summary = 'No inconsistencies in FileFastq paired end info'
        check.description = 'All paired end fastq files have both paired end number and "paired with" related_file'
    check.full_output = results
    check.brief_output = missing_number + missing_related
    return check

예제 #13

0

파일 보기

파일: investigate_variants.py 프로젝트: burakalver/cgap_scratch

def search_result(params):
    '''
    Run a search built from ``params`` and return the result.

    The access key is read from the KEYNAME entry of ~/keypairs.json.
    ``params`` is url-encoded into the query string, e.g. {"type": "Gene"}.
    '''
    keyfile_path = path.expanduser("~") + '/keypairs.json'
    with open(keyfile_path) as keyfile:
        key = json.load(keyfile)[KEYNAME]
    query = "/search/" + "?" + urlencode(params)
    return ff_utils.search_metadata(query, key=key)

예제 #14

0

파일 보기

파일: pony_utils.py 프로젝트: VonRosenchild/tibanna

def batch_fastqc(env, batch_size=20):
    '''
    Run fastqc on uploaded files that do not have a fastqc run yet.

    Searches for files with status 'uploaded' (at most ``batch_size`` are
    handled), skips files that have a workflow_run_inputs entry containing
    'fastqc', and calls run_fastqc for the others.  Sleeps 5 seconds after
    every file and a further 60 seconds whenever the processed count is a
    multiple of 10.  On SIGINT the counts are printed and the process exits.
    '''
    n_processed = 0
    n_skipped = 0

    # handle ctrl-c
    import signal

    def report(signum, frame):
        print("Processed %s files, skipped %s files" %
              (n_processed, n_skipped))
        sys.exit(-1)

    signal.signal(signal.SIGINT, report)

    tibanna = Tibanna(env=env)
    search_response = search_metadata(
        "search/?type=File&status=uploaded&limit=%s" % batch_size,
        key=tibanna.ff_key,
        ff_env=tibanna.env)

    # TODO: need to change submit 4dn to not overwrite my limit
    candidates = search_response['@graph']
    if len(candidates) > batch_size:
        candidates = candidates[:batch_size]

    for ufile in candidates:
        already_run = False
        for wfrun in ufile.get('workflow_run_inputs', []):
            if 'fastqc' in wfrun:
                already_run = True
        if already_run:
            print("******** fastqc already run for %s skipping" %
                  ufile.get('accession'))
            n_skipped += 1
        else:
            print("running fastqc for %s" % ufile.get('accession'))
            run_fastqc(env, ufile.get('accession'), ufile.get('uuid'))
            n_processed += 1
        sleep(5)
        if n_processed % 10 == 0:
            sleep(60)

    print("Processed %s files, skipped %s files" %
          (n_processed, n_skipped))

예제 #15

0

파일 보기

파일: pony_utils.py 프로젝트: VonRosenchild/tibanna

 def __init__(self, ff_keys):
     """Search the server for all FileFormat objects and build ``self.fe_dict``.

     ``self.fe_dict`` maps each file_format to a dict with its standard
     extension, other allowed extensions and extra file formats (the last
     two default to []).  Any failure while searching is re-raised as a
     plain Exception carrying the original message.
     """
     try:
         printlog("Searching in server : " + ff_keys['server'])
         all_formats = search_metadata("/search/?type=FileFormat&frame=object",
                                       key=ff_keys)
     except Exception as e:
         raise Exception("Can't get the list of FileFormat objects. %s\n" %
                         e)
     self.fe_dict = {}
     printlog("**ffe_all = " + str(all_formats))
     for fmt in all_formats:
         name = fmt['file_format']
         entry = {
             'standard_extension': fmt['standard_file_extension'],
             'other_allowed_extensions': fmt.get('other_allowed_extensions', []),
             'extrafile_formats': fmt.get('extrafile_formats', []),
         }
         self.fe_dict[name] = entry

예제 #16

0

파일 보기

파일: generate_ontology.py 프로젝트: hms-dbmi/encode

def get_ontologies(connection, ont_list):
    '''Return the list of ontology jsons retrieved from the server.

    ont_list: the string 'all' to search for every Ontology item, or an
        iterable of ontology identifiers that are fetched one by one from
        'ontologys/<identifier>'.

    Results whose '@type' does not include 'Ontology' (e.g. item-not-found
    responses) are filtered out.  Exits the process if the retrieved result
    is not a list or tuple.
    '''
    if ont_list == 'all':
        ontologies = search_metadata('search/?type=Ontology', connection)
    else:
        ontologies = [get_metadata('ontologys/' + ontology, connection) for ontology in ont_list]
    # removing item not found cases with reporting
    if not isinstance(ontologies, (list, tuple)):
        print("we must not have got ontolgies... bailing")
        import sys
        sys.exit()
    # Build a new list rather than popping while iterating: popping shifts the
    # remaining elements and makes the loop skip the one after each removal.
    # NOTE(review): a result without '@type' is treated as "not an Ontology"
    # here instead of raising KeyError - confirm that is the desired handling.
    return [ontology for ontology in ontologies
            if 'Ontology' in ontology.get('@type', [])]

예제 #17

0

파일 보기

파일: get_field_info.py 프로젝트: mccalluc/Submit4DN

 def __init__(self, connection, schema_name):
     """Load the schema for ``schema_name`` from '/profiles/<name>.json'.

     Sets ``self.required`` to the schema's 'required' value (None when
     absent) and ``self.properties`` to its 'properties'.  For schema names
     listed in file_types that have a 'file_format' property, the property's
     'enum' is filled with the file_format values of the FileFormat items
     found for that schema name.
     """
     schema = ff_utils.get_metadata('/profiles/' + schema_name + '.json',
                                    key=connection.key,
                                    add_on="frame=object")
     self.required = schema['required'] if 'required' in schema else None
     props = schema['properties']
     if schema_name in file_types and props.get('file_format'):
         query = '/search/?type=FileFormat&field=file_format&valid_item_types={}'.format(
             schema_name)
         format_hits = ff_utils.search_metadata(query, key=connection.key)
         props['file_format']['enum'] = [hit['file_format'] for hit in format_hits]
     self.properties = props

예제 #18

0

파일 보기

파일: generate_ontology.py 프로젝트: hms-dbmi/encode

def get_slim_terms(connection):
    '''Collect the OntologyTerm items returned for each slim category.

    One search is made per hard-coded category using the 'is_slim_for'
    parameter; a TypeError raised while searching or collecting a category
    is printed and that category is skipped.  Returns the combined list.
    '''
    # the categories are hard coded for now; see the searches below
    categories = ('developmental', 'assay', 'organ', 'system', 'cell')
    query_prefix = 'search/?type=OntologyTerm&is_slim_for='
    collected = []
    for category in categories:
        try:
            collected.extend(search_metadata(query_prefix + category, connection))
        except TypeError as err:
            print(err)
    return collected

예제 #19

0

파일 보기

파일: badge_checks.py 프로젝트: 4dn-dcic/foursight

def compare_badges(obj_ids, item_type, badge, ff_keys):
    '''
    Compares items that should have a given badge to items that do have the given badge.
    Used for badges that utilize a single message choice.

    Args:
        obj_ids: list of item @ids that should have the badge.
        item_type: item type used in the search query.
        badge: badge name; used in the search query and to pick out the
            matching entries of each item's 'badges' list.
        ff_keys: passed to ff_utils.search_metadata as ``key``.

    Returns:
        (needs_badge, remove_badge, badge_ok) where needs_badge lists the
        @ids from obj_ids that do not have the badge yet (input order kept),
        remove_badge maps @id -> badges to keep for items that have the badge
        but should not, and badge_ok lists @ids that correctly have it.
    '''
    search_url = 'search/?type={}&badges.badge.@id=/badges/{}/'.format(item_type, badge)
    has_badge = ff_utils.search_metadata(search_url + '&frame=object', key=ff_keys)
    # sets give constant-time membership tests in the loops below
    expected_ids = set(obj_ids)
    badge_ok = []
    remove_badge = {}
    for item in has_badge:
        if item['@id'] in expected_ids:
            badge_ok.append(item['@id'])
        else:
            # keep every badge entry except the one being compared
            keep = [badge_dict for badge_dict in item['badges'] if badge not in badge_dict['badge']]
            remove_badge[item['@id']] = keep
    ok_ids = set(badge_ok)
    needs_badge = [other_item for other_item in obj_ids if other_item not in ok_ids]
    return needs_badge, remove_badge, badge_ok

예제 #20

0

파일 보기

파일: badge_checks.py 프로젝트: 4dn-dcic/foursight

def gold_biosamples(connection, **kwargs):
    '''
    Gold level commendation criteria:
    1. Tier 1 or Tier 2 Cells obtained from the approved 4DN source and grown
    precisely according to the approved SOP including any additional
    authentication (eg. HAP-1 haploid line requires ploidy authentication).
    2. All required metadata present (does not have a biosample warning badge).

    A biosample from the search qualifies when every entry of its
    cell_culture_details has follows_sop == 'Yes' and its status is not in
    REV.  The qualifying @ids are compared with the current 'gold-biosample'
    badges; the action is only allowed when the latest
    'yellow_flag_biosamples' check result has status PASS.
    '''
    check = CheckResult(connection, 'gold_biosamples')

    search_url = ('search/?biosource.cell_line_tier=Tier+1&biosource.cell_line_tier=Tier+2'
                  '&type=Biosample&badges.badge.warning=No+value')
    biosamples = ff_utils.search_metadata(search_url, key=connection.ff_keys)
    gold = []
    for biosample in biosamples:
        # follows SOP w/ no deviations
        follows_sop = all(details.get('follows_sop', '') == 'Yes'
                          for details in biosample.get('cell_culture_details', []))
        if follows_sop and biosample.get('status') not in REV:
            gold.append(biosample['@id'])
    to_add, to_remove, ok = compare_badges(gold, 'Biosample', 'gold-biosample', connection.ff_keys)
    check.action = 'patch_gold_biosample_badges'
    if not (to_add or to_remove):
        check.status = 'PASS'
        check.summary = 'Gold biosample badges up-to-date'
        check.description = 'No gold biosample badges need patching'
    else:
        n_to_patch = len(to_add) + len(to_remove.keys())
        check.status = 'WARN'
        check.summary = 'Gold biosample badges need patching'
        check.description = '{} biosamples need gold badges patched. '.format(n_to_patch)
        check.description += 'Yellow_flag_biosamples check must pass before patching.'
        yellow_check = CheckResult(connection, 'yellow_flag_biosamples')
        latest_yellow = yellow_check.get_latest_result()
        if latest_yellow['status'] == 'PASS':
            check.allow_action = True
    check.full_output = {'Add badge': to_add,
                         'Remove badge': to_remove,
                         'Keep badge (no change)': ok}
    return check

예제 #21

0

파일 보기

파일: google_utils.py 프로젝트: 4dn-dcic/foursight

        def get_latest_tracking_item_date(self, increment="daily"):
            """
            Return the 'for_date' of the first TrackingItem found for ``increment``.

            Searches google_analytics TrackingItems with the given
            date_increment, sorted by '-google_analytics.for_date' and limited
            to one result.  Returns a ``date`` built from that item's
            'YYYY-MM-DD' for_date, or None when the search returns nothing.

            Raises IndexError when ``increment`` is not 'daily' or 'monthly'.

            TODO: Accept yearly once we want to collect & viz it.
            """
            if increment not in ('daily', 'monthly'):
                raise IndexError("increment parameter must be one of 'daily', 'monthly'")

            query = (
                '/search/?type=TrackingItem&tracking_type=google_analytics'
                '&sort=-google_analytics.for_date&limit=1'
                '&google_analytics.date_increment='
            ) + increment
            auth_key = dict(self.owner.access_key, server=self.owner.server)
            hits = ff_utils.search_metadata(query, key=auth_key, page_limit=1)
            if len(hits) == 0:
                return None

            latest = hits[0]
            iso_date = latest['google_analytics']['for_date']

            # TODO: Use date.fromisoformat() once we're on Python 3.7
            # date() takes a 1-based month, so the parsed values are used as is
            year, month, day = (int(part) for part in iso_date.split('-', 2))
            return date(year, month, day)

예제 #22

0

파일 보기

def main():
    """Build the Hi-C data-highlights tables and post/patch them as static sections.

    Steps, in order:
    1. Authenticate with the key given in the command-line args.
    2. Search publications and released replicate experiment sets for the
       hard-coded Hi-C experiment types.
    3. Load the dataset groups from 'files/dsg.json' (located relative to
       the parent directory of this file's directory).
    4. Assemble one table row per dataset group from the experiment sets.
    5. Interactively ask what to do with datasets missing from the json.
    6. For each study group, render the table (markdown or html depending on
       args.format) and patch the matching static section, or interactively
       post/skip it when it does not exist.  Nothing is written in a dry run.
    7. Print a summary of patched, posted and skipped sections.
    """

    # getting authentication keys
    args = get_args()
    try:
        auth = ff_utils.get_authentication_with_server(args.key)
    except Exception as e:
        print("Authentication failed", e)
        sys.exit(1)

    dryrun = args.dryrun
    if dryrun:
        print("\nThis is a dry run\n")

    # collecting publication and expset search results
    hic_types = [
        'in+situ+Hi-C', 'Dilution+Hi-C', 'DNase+Hi-C', 'Micro-C', 'TCC'
    ]
    query_pub = '/search/?type=Publication'
    query_exp = '/search/?type=ExperimentSetReplicate&status=released'
    # one query parameter per experiment type is appended to both queries
    # NOTE(review): the loop variable shadows the builtin `type` inside main
    for type in hic_types:
        query_pub += '&exp_sets_prod_in_pub.experiments_in_set.experiment_type.display_title=' + type
        query_exp += '&experiments_in_set.experiment_type.display_title=' + type
    pubs_search = ff_utils.search_metadata(query_pub, key=auth)
    expsets_search = ff_utils.search_metadata(query_exp, key=auth)

    # building publications dictionary
    pubs_dict = convert_pubs_list_to_lookup(pubs_search)

    # loading dataset groups from json file
    repo_path = Path(__file__).resolve().parents[1]
    dsg_filename = repo_path.joinpath('files', 'dsg.json')
    if dsg_filename.exists():
        with open(dsg_filename) as dsg_fn:
            dsgs = json.load(dsg_fn)
    else:
        sys.exit("ERROR: Dataset grouping file not found")

    # making dataset list and mapping to dataset group
    dataset_list = []
    datasets_of_dsg = {}
    for k, v in dsgs.items():
        if v.get("datasets"):
            dataset_list.extend(v["datasets"])
            datasets_of_dsg[k] = v["datasets"]
        else:
            # if a dsg does not have datasets, then the dsg itself is the dataset
            dataset_list.append(k)

    # building the output table
    table = {}
    new_datasets = set()
    study_groups = set()

    for expset in expsets_search:
        dataset = expset.get("dataset_label")
        # datasets unknown to the json file are only collected for the prompt below
        if dataset not in dataset_list:
            new_datasets.add(dataset)
            continue

        # default: the dataset is its own group and the link filters on it alone
        dsg = dataset
        dsg_link = "dataset_label=" + dataset
        for group, elements in datasets_of_dsg.items():
            if dataset in elements:
                # link filters on every dataset of the group
                dsg_link = ("dataset_label=" +
                            "&dataset_label=".join(elements))
                dsg = group
                break
        # escape '+' and '/' first, then turn spaces into '+'
        dsg_link = "/browse/?" + dsg_link.replace("+", "%2B").replace(
            "/", "%2F").replace(" ", "+")

        study_groups.add(dsgs[dsg].get("study_group"))

        row = table.get(dsg, {})
        table[dsg] = assemble_data_for_the_row(row, expset, dsg, dsg_link,
                                               pubs_dict, dsgs[dsg])

    # summarize number of experiment sets of each experiment type in a string
    for dsg, row in table.items():
        exp_type_summary = ""
        for exp_type, count in row["Replicate Sets"].items():
            if count > 0:
                exp_type_summary += str(count) + " " + exp_type + "<br>"
        if len(exp_type_summary) > 0:
            row['Replicate Sets'] = exp_type_summary[:
                                                     -4]  #remove <br> at the end
        else:
            row['Replicate Sets'] = ""

    # if new datasets are not in the json, ask what to do
    if new_datasets:
        print("New datasets found (not present in the json file):")
        for ds in new_datasets:
            print(ds)
        print("(i)gnore datasets or (e)xit to manually add them? [i/e]")
        response = None
        # keep reading input until a valid answer is given
        while response not in ['i', 'e']:
            response = input()
        if response == 'e':
            sys.exit("Add new dataset to dsg.json before generating table")

    # patch the static section for each study group
    skipped = []
    posted = []
    patched = []
    for studygroup in list(study_groups):

        # prepare static section
        # rows are picked in the order of the json file, keeping only those
        # whose "Class" equals this study group
        table_dsg = {}
        for dsg in dsgs:
            if table.get(dsg):
                if table[dsg].get("Class") != studygroup:
                    continue
                else:
                    table_dsg[dsg] = table.get(dsg)

        keys = [
            'Data Set', 'Project', 'Replicate Sets', 'Species', 'Biosources',
            'Publication', 'Study', 'Lab'
        ]
        if studygroup == "Single Time Point and Condition":
            keys.remove('Study')

        name = alias = output = filetype = None
        if args.format == 'markdown':
            name = "data-highlights.hic." + studygroup + ".md"
            name = name.lower().replace(" ", "-")
            alias = "4dn-dcic-lab:" + name
            filetype = 'jsx'
            default_col_widths = "[-1,100,-1,100,-1,-1,-1,-1]"
            # one column fewer when the 'Study' column was removed above
            if "Study" not in keys:
                default_col_widths = "[-1,100,-1,120,250,-1,170]"
            output = md_table_maker(table_dsg, keys, name, default_col_widths)
        else:
            name = "data-highlights.hic." + studygroup
            name = name.lower().replace(" ", "-")
            alias = "4dn-dcic-lab:" + name
            filetype = 'html'
            styles = {
                'Data Set': ";width:20%;min-width:120px",
                'Replicate Sets': ";width:150px",
                'Publication': ";width:200px"
            }
            output = html_table_maker(table_dsg, keys, styles)

        # check if static section exists
        post = False
        try:
            ff_utils.get_metadata(alias, auth)
        except Exception:
            # any failure of the GET is treated as "section does not exist"
            print(
                "'{}' static section cannot be patched because it does not exist"
                .format(studygroup))
            print("Do you want to (p)ost or (s)kip this static section? [p/s]")
            response = None
            while response not in ['p', 's']:
                response = input()
            if response == 's':
                skipped.append(alias)
                continue
            else:
                post = True

        # post or patch static section
        if post:
            post_body = {
                "name": name,
                "aliases": [alias],
                "body": output,
                "section_type": "Page Section",
                "title": studygroup,
                "options": {
                    "collapsible": True,
                    "default_open": True,
                    "filetype": filetype
                }
            }
            if not dryrun:
                res = ff_utils.post_metadata(post_body,
                                             "StaticSection",
                                             key=auth)
            # the alias is recorded even in a dry run
            posted.append(alias)
        else:
            patch_body = {"body": output}
            if not dryrun:
                res = ff_utils.patch_metadata(patch_body, alias, key=auth)
            patched.append(alias)
        # `res` is only assigned when not in a dry run, hence the same guard here
        if not dryrun:
            print("{}: {}".format(alias, res['status']))

    # summarize results
    print("Static sections summary: {} patched, {} posted, {} skipped".format(
        len(patched), len(posted), len(skipped)))
    if posted:
        print(
            "Remember to add the new static section(s) to the hic-data-overview page:"
        )
        for item in posted:
            print(item)
    if skipped:
        print("Skipped sections:")
        for item in skipped:
            print(item)

예제 #23

0

파일 보기

def _static_header_ids(item):
    """Return the static header @ids of a search result as a new list."""
    headers = item.get('static_headers') or []
    # handle case where frame != object: headers are embedded dicts
    if headers and isinstance(headers[0], dict):
        return [obj['@id'] for obj in headers]
    return list(headers)


def find_items_for_header_processing(connection,
                                     check,
                                     header,
                                     add_search=None,
                                     remove_search=None,
                                     append=True):
    """
    Find items that should get a static header added (add_search) or
    removed (remove_search) and record them on the check.
    Args are:
    - connection (FS connection)
    - check (required; check object initialized by CheckResult)
    - headers @id (required)
    - add_search search query
    - remove_search search query
    - append: if True the header is added after the existing headers,
      otherwise it is placed before them
    Meant to be used for CHECKS

    Nothing is returned; the check is modified in place. Its full_output
    maps each affected item @id to the static header list it should end up
    with, and status/summary (plus the action fields when there is work to
    do) are set accordingly.
    """
    to_add = {}
    to_remove = {}
    # sets the full_output of the check!
    check.full_output = {
        'static_section': header,
        'to_add': to_add,
        'to_remove': to_remove
    }
    # this GET will fail if the static header does not exist; the response
    # itself is not needed
    ff_utils.get_metadata(header, key=connection.ff_keys)

    # add entries keyed by item @id with value of the static headers
    if add_search:
        for search_res in ff_utils.search_metadata(add_search,
                                                   key=connection.ff_keys):
            curr_headers = _static_header_ids(search_res)
            if header in curr_headers:
                continue
            if append:
                to_add[search_res['@id']] = curr_headers + [header]
            else:
                to_add[search_res['@id']] = [header] + curr_headers

    if remove_search:
        for search_res in ff_utils.search_metadata(remove_search,
                                                   key=connection.ff_keys):
            curr_headers = _static_header_ids(search_res)
            if header in curr_headers:
                # only the first occurrence is dropped
                curr_headers.remove(header)
                to_remove[search_res['@id']] = curr_headers

    if to_add or to_remove:
        check.status = 'WARN'
        check.summary = 'Ready to add and/or remove static header'
        check.description = 'Ready to add and/or remove static header: %s' % header
        check.allow_action = True
        check.action_message = 'Will add static header to %s items and remove it from %s items' % (
            len(to_add), len(to_remove))
    else:
        check.status = 'PASS'
        check.summary = 'Static header is all set'

예제 #24

0

파일 보기

def initialize_user_content(spawner):
    """
    Used to initialize the user's s3-backed notebook storage.
    For initialization, ensure all notebook templates are copied
    (check every time)
    In addition, load access keys from Fourfront and add them to the
    environment variables of the notebook. Also delete previously created
    access keys used for Jupyterhub for the user
    Also initializes a TrackingItem of type jupyterhub_session to track some
    basic information on the JH session

    Errors from the individual steps are not raised; they are collected and
    stored as JSON in the INIT_ERR_OUTPUT environment variable.
    """
    err_output = []  # keep track of errors for debugging

    # grab this info fresh every time
    ff_keys = recompute_ff_keys(err_output)

    username = spawner.user.name  # get the username
    template_bucket = os.environ['AWS_TEMPLATE_BUCKET']
    list_res = s3_client.list_objects_v2(Bucket=template_bucket)
    # NOTE(review): only the first page of the listing is processed - confirm
    # the template bucket never exceeds a single page of results.

    # every template is copied under the same per-user prefix
    user_subdir = 'user-' + escape_string(username)

    # check each template individually
    for template_res in list_res.get('Contents', []):
        template_key = template_res['Key']
        notebook_temp_key = '/'.join([user_subdir, template_key])
        source_info = {"Bucket": template_bucket, "Key": template_key}
        try:  # always replace templates
            s3_client.copy_object(Bucket=os.environ["AWS_NOTEBOOK_BUCKET"],
                                  Key=notebook_temp_key,
                                  CopySource=source_info)
        except Exception as copy_exc:
            err_output.append({'copying_templates': str(copy_exc)})

    # get the access keys and set them as environment variables for the user
    try:
        ff_user = ff_utils.get_metadata('/users/' + username, key=ff_keys)
    except Exception as user_exc:
        err_output.append({'getting_user': str(user_exc)})
        # if we get here, old access key state must be cleared.
        clear_old_access_keys()
    else:
        key_descrip = 'jupyterhub_key'
        search_q = ''.join([
            '/search/?type=AccessKey&status=current&description=', key_descrip,
            '&user.uuid=', ff_user['uuid']
        ])
        try:
            user_keys = ff_utils.search_metadata(search_q, key=ff_keys)
        except Exception as search_exc:
            err_output.append({'searching_keys': str(search_exc)})
        else:
            # retire every previously created jupyterhub key; one failure
            # must not prevent the remaining keys from being deleted
            for ukey in user_keys:
                try:
                    ff_utils.patch_metadata({'status': 'deleted'},
                                            ukey['uuid'],
                                            key=ff_keys)
                except Exception as patch_exc:
                    err_output.append({'deleting_keys': str(patch_exc)})
        # access key will be submitted by 4dn-dcic admin but belong to user
        key_body = {'user': ff_user['uuid'], 'description': key_descrip}
        try:
            key_res = ff_utils.post_metadata(key_body,
                                             'access-keys',
                                             key=ff_keys)
        except Exception as key_exc:
            err_output.append({'post_key': str(key_exc)})
            # if we get here, old access key state must be cleared.
            clear_old_access_keys()
        else:
            os.environ['FF_ACCESS_KEY'] = key_res['access_key_id']
            os.environ['FF_ACCESS_SECRET'] = key_res['secret_access_key']

        # initialize a tracking item for the session and store its uuid in env
        # set `submitted_by` manually to allow user to edit
        tracking_body = {
            'jupyterhub_session': {
                # timezone-aware replacement for the deprecated utcnow();
                # isoformat() already ends in '+00:00', so the stored
                # string has the same shape as before
                'date_initialized':
                datetime.datetime.now(datetime.timezone.utc).isoformat(),
                'user_uuid': ff_user['uuid']
            },
            'tracking_type': 'jupyterhub_session',
            'submitted_by': ff_user['uuid']
        }
        try:
            track_res = ff_utils.post_metadata(tracking_body,
                                               'tracking-items',
                                               key=ff_keys)
        except Exception as track_exc:
            err_output.append({'tracking_item': str(track_exc)})
        else:
            os.environ['FF_TRACKING_ID'] = track_res['@graph'][0]['uuid']

    os.environ['INIT_ERR_OUTPUT'] = json.dumps(err_output)

예제 #25

0

파일 보기

파일: update_inserts_from_server.py 프로젝트: hms-dbmi/encode

def main():
    """
    Use this command to update the inserts from a given fourfront env

    Local inserts of the destination directory are read, uuids from an
    optional --from-search query are added, and the resulting uuids (and all
    items linked from them) are fetched from the server with
    expand_es_metadata. Local items that the server did not return are kept,
    and the merged result is passed to dump_results_to_json.

    When the destination is `inserts`, server items that also exist in
    `master-inserts` are compared with the master copy: identical items are
    left out of the update, differing items abort it.

    Raises:
        Exception: if --dest is not one of the known inserts directories, or
            if a server item differs from its copy in `master-inserts`.
    """
    logging.basicConfig()
    # Loading app will have configured from config file. Reconfigure here:
    logging.getLogger('encoded').setLevel(logging.DEBUG)

    parser = argparse.ArgumentParser(
        description="Update Inserts", epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--env', default='data',
                        help='FF environment to update from. Defaults to data')
    parser.add_argument('--dest', default='temp-local-inserts',
                        help="destination file in inserts dir to write to")
    parser.add_argument('--item-type', action='append', default=[],
                        help="item type, e.g. file_fastq. Defaults to all types")
    parser.add_argument('--ignore-field', action='append', default=[],
                        help='field name to ignore when running expand_es_metadata')
    parser.add_argument('--from-search', help='query passed to search_metadata to find uuids')

    args = parser.parse_args()
    # this will work since bin/ commands are run from root FF directory
    inserts_location = 'src/encoded/tests/data'
    # hardcode these to prevent accidental creation of inserts files
    inserts_files = ['inserts', 'master-inserts', 'perf-testing',
                     'workbook-inserts', 'temp-local-inserts']
    if args.dest not in inserts_files:
        raise Exception('Specified inserts destination %s must be one of: %s'
                        % (args.dest, inserts_files))
    inserts_path = '/'.join([inserts_location, args.dest])

    local_inserts, item_uuids = read_local_inserts_dir(args.dest, inserts_path, args.item_type)

    # add uuids from the input search result, if present
    if args.from_search:
        use_search = args.from_search
        # get frame=object search results to keep response small
        if 'frame=' not in use_search:
            use_search += '&frame=object'
        search_res = search_metadata(use_search, ff_env=args.env)
        search_uuids = [item['uuid'] for item in search_res]
        logger.info('update_inserts: Will update using %s items from search', len(search_uuids))
        item_uuids = list(set(item_uuids + search_uuids))

    # now find uuids and all linked from the given server
    svr_inserts, svr_uuids = expand_es_metadata(item_uuids, ff_env=args.env,
                                                store_frame='raw', add_pc_wfr=True,
                                                ignore_field=args.ignore_field)

    # if we are updating `inserts`, must make sure that items don't conflict
    # with those in `master-inserts`
    skip_uuids = set()
    if args.dest == 'inserts':
        master_path = '/'.join([inserts_location, 'master-inserts'])
        master_inserts, master_uuids = read_local_inserts_dir('master-inserts', master_path)
        # build the lookup once instead of scanning master_uuids per item
        master_uuid_set = set(master_uuids)
        item_conflict_report = {}
        for item_type in svr_inserts:
            itype_err = []
            itype_okay = []
            for item in svr_inserts[item_type]:
                if item['uuid'] not in master_uuid_set:
                    continue
                # compare inserts through their key-sorted json serialization
                svr_json = json.dumps(item, sort_keys=True)
                mstr_json = json.dumps(master_inserts[item_type][item['uuid']], sort_keys=True)
                if svr_json != mstr_json:
                    itype_err.append(item['uuid'])
                else:
                    # the json is the same. Remove from the `inserts` update
                    skip_uuids.add(item['uuid'])
                    itype_okay.append(item['uuid'])
            item_conflict_report[item_type] = {'error': itype_err, 'okay': itype_okay}
        if any(report['error'] for report in item_conflict_report.values()):
            error_report = {it: item_conflict_report[it]['error'] for it in item_conflict_report}
            logger.error('update_inserts: Cannot update the following items in "inserts" directory,'
                         ' since there are conflicting items with different values'
                         ' in the master-inserts. Update those first. Conflicts:\n%s',
                         json.dumps(error_report, indent=4))
            raise Exception('Cannot load inserts as there are conflicting items in `master-inserts`')
        elif any(report['okay'] for report in item_conflict_report.values()):
            conflict_report = {it: item_conflict_report[it]['okay'] for it in item_conflict_report}
            logger.warning('update_inserts: The following items are already in "master-inserts".'
                           ' Will not add to "inserts". Items:\n%s',
                           json.dumps(conflict_report, indent=4))

    # now we need to update the server inserts with contents from local inserts
    # so that existing information is not lost
    svr_uuid_set = set(svr_uuids)
    for item_type in svr_inserts:
        # remove items specified by skip uuids
        if skip_uuids:
            svr_inserts[item_type] = [insrt for insrt in svr_inserts[item_type]
                                      if insrt['uuid'] not in skip_uuids]
        # keep local items that the server did not return
        for item_uuid, local_item in local_inserts.get(item_type, {}).items():
            if item_uuid not in svr_uuid_set and item_uuid not in skip_uuids:
                svr_inserts[item_type].append(local_item)
    dump_results_to_json(svr_inserts, inserts_path)
    logger.info('update_inserts: Successfully wrote to %s', inserts_path)
    for item_type in svr_inserts:
        logger.info('update_inserts: Wrote %s items to %s',
                    len(svr_inserts[item_type]), item_type + '.json')

예제 #26

0

파일 보기

파일: cleanup.py 프로젝트: 4dn-dcic/dcicwrangling

def _delete_wfr_and_outputs(wfr_to_del, my_key):
    """Patch a workflow run report entry, its output files and its QCs to deleted."""
    patch_data = {
        'description': "This workflow run is deleted",
        'status': "deleted"
    }
    ff_utils.patch_metadata(patch_data,
                            obj_id=wfr_to_del['wfr_uuid'],
                            key=my_key)
    # delete output files of the deleted workflow run
    if wfr_to_del['outputs']:
        for out_file in wfr_to_del['outputs']:
            ff_utils.patch_metadata({'status': "deleted"},
                                    obj_id=out_file,
                                    key=my_key)
    if wfr_to_del.get('qcs'):
        for out_qc in wfr_to_del['qcs']:
            ff_utils.patch_metadata({'status': "deleted"},
                                    obj_id=out_qc,
                                    key=my_key)


def delete_wfrs(file_resp, my_key, delete=False, stash=None):
    """Report, and optionally delete, stale workflow runs that use a file as input.

    Args:
        file_resp: file item in embedded frame.
        my_key: access key passed on to the ff_utils calls.
        delete: when False (default) candidates are only printed; when True
            they are patched to status "deleted" together with their output
            files and quality metrics.
        stash: optional collection holding all related workflow runs for
            file_resp; when given, it is used instead of fetching the runs.

    Returns:
        The list of workflow run uuids that were patched to deleted, or None
        when the function exits early (user submitted processed file, control
        file, no workflow runs, or a released run on a deleted file).
    """
    deleted_wfrs = []
    file_type = file_resp['@id'].split('/')[1]
    # special clause until we sort input_wfr_switch issue
    # do not delete output wfrs of control files
    output_wfrs = file_resp.get('workflow_run_outputs')
    if not output_wfrs:
        if file_type == 'files-processed':
            # user submitted processed files
            return
        # raw files: nothing to check here
    else:
        # the workflow name/version is the part of the title before ' run '
        wfr_type = output_wfrs[0]['display_title'].split(' run ')[0]
        if wfr_type == 'encode-chipseq-aln-ctl 1.1.1':
            print('skipping control file for wfr check',
                  file_resp['accession'])
            return

    # the field can be missing when a file was never used as an input
    wfr_uuids = [
        i['uuid'] for i in file_resp.get('workflow_run_inputs') or []
    ]
    wfrs = []
    if wfr_uuids:
        # fetch them from stash
        if stash:
            wfrs = [i for i in stash if i['uuid'] in wfr_uuids]
            assert len(wfrs) == len(wfr_uuids)
        # if no stash, get from database
        else:
            wfrs = [
                i['embedded'] for i in ff_utils.get_es_metadata(
                    wfr_uuids, sources=['embedded.*'], key=my_key)
            ]
    # look for md5s on files without wfr_run_output (file_microscopy ...)
    elif file_type not in ['files-fastq', 'files-processed']:
        wfrs_url = (
            '/search/?type=WorkflowRun&type=WorkflowRun&workflow.title=md5+0.2.6&workflow.title=md5+0.0.4'
            '&input_files.value.accession=') + file_resp['accession']
        wfrs = ff_utils.search_metadata(wfrs_url, key=my_key)
    # Skip sbg and file provenance
    wfrs = [
        i for i in wfrs
        if not i['@id'].startswith('/workflow-runs-sbg/')
        and not i['display_title'].startswith('File Provenance Tracking')
    ]

    # CLEAN UP IF FILE IS DELETED
    if file_resp['status'] == 'deleted':
        if file_resp.get('quality_metric') and delete:
            qc_uuid = file_resp['quality_metric']['uuid']
            ff_utils.delete_field(file_resp, 'quality_metric', key=my_key)
            # delete quality metrics object
            ff_utils.patch_metadata({'status': "deleted"},
                                    obj_id=qc_uuid,
                                    key=my_key)
        # delete all workflows for deleted files
        if not wfrs:
            return
        for wfr_to_del in get_wfr_report(wfrs):
            if wfr_to_del['status'] == 'deleted':
                continue
            if wfr_to_del['wfr_name'] not in workflow_names:
                print('Unlisted Workflow', wfr_to_del['wfr_name'],
                      'deleted file workflow', wfr_to_del['wfr_uuid'],
                      file_resp['accession'])
            ####################################################
            # TEMPORARY PIECE
            # NOTE(review): these exits return None and drop any uuids
            # already collected in deleted_wfrs - confirm this is intended.
            if wfr_to_del['status'] == 'released to project':
                print('saved from deletion', wfr_to_del['wfr_name'],
                      'deleted file workflow', wfr_to_del['wfr_uuid'],
                      file_resp['accession'])
                return
            if wfr_to_del['status'] == 'released':
                print('delete released!!!!!', wfr_to_del['wfr_name'],
                      'deleted file workflow', wfr_to_del['wfr_uuid'],
                      file_resp['accession'])
                return
            #####################################################
            print(wfr_to_del['wfr_name'], 'deleted file workflow',
                  wfr_to_del['wfr_uuid'], file_resp['accession'])
            if delete:
                deleted_wfrs.append(wfr_to_del['wfr_uuid'])
                _delete_wfr_and_outputs(wfr_to_del, my_key)
        return deleted_wfrs

    # file is not deleted: get a report on all workflow_runs
    if not wfrs:
        return
    wfr_report = get_wfr_report(wfrs)
    # check if any unlisted wfr in report
    unlisted = [
        i['wfr_name'] for i in wfr_report
        if i['wfr_name'] not in workflow_names
    ]
    # report the unlisted ones
    if unlisted:
        print('Unlisted Workflow', unlisted, 'skipped in',
              file_resp['accession'])
    for wf_name, accepted_rev, accepted_run_time in workflow_details:
        # for each type of workflow make a list of old ones, and patch status and description
        sub_wfrs = [i for i in wfr_report if i['wfr_name'] == wf_name]
        if not sub_wfrs:
            continue
        # the last report entry is treated as the active run
        active_wfr = sub_wfrs[-1]
        old_wfrs = sub_wfrs[:-1]
        # check the status of the most recent workflow
        if active_wfr['wfr_status'] != 'complete':
            if (active_wfr['wfr_status'] in ['running', 'started']
                    and active_wfr['run_time'] < accepted_run_time):
                print(wf_name, 'still running for', file_resp['accession'])
            else:
                old_wfrs.append(active_wfr)
        elif active_wfr['wfr_version'] not in accepted_rev:
            old_wfrs.append(active_wfr)
        for wfr_to_del in old_wfrs:
            if wfr_to_del['status'] == 'deleted':
                continue
            if wfr_to_del['status'] in ['archived', 'replaced']:
                print(wfr_to_del['wfr_name'], wfr_to_del['status'],
                      ' wfr found, skipping ', wfr_to_del['wfr_uuid'],
                      file_resp['accession'])
                continue
            ####################################################
            # TEMPORARY PIECE
            if wfr_to_del['status'] == 'released to project':
                print('saved from deletion', wfr_to_del['wfr_name'],
                      'old style or dub', wfr_to_del['wfr_uuid'],
                      file_resp['accession'])
                continue
            if wfr_to_del['status'] == 'released':
                print('delete released????', wfr_to_del['wfr_name'],
                      'old style or dub', wfr_to_del['wfr_uuid'],
                      file_resp['accession'])
                continue
            ####################################################

            print(wfr_to_del['wfr_name'], 'old style or dub',
                  wfr_to_del['wfr_uuid'], file_resp['accession'])
            if delete:
                deleted_wfrs.append(wfr_to_del['wfr_uuid'])
                _delete_wfr_and_outputs(wfr_to_del, my_key)
    return deleted_wfrs

예제 #27

0

파일 보기

def main():
    """
    Use this command to update the inserts from a given fourfront env

    Local inserts of the destination directory are read, uuids from an
    optional --from-search query are added, and the resulting uuids (and all
    items linked from them) are fetched from the server with
    expand_es_metadata. Server items are ordered like the existing local
    inserts, local items that the server did not return are kept, and the
    merged result is passed to dump_results_to_json.

    When the destination is `inserts`, server items that also exist in
    `master-inserts` are compared with the master copy: identical items are
    left out of the update, differing items abort it.

    Raises:
        Exception: if --dest is not one of the known inserts directories, or
            if a server item differs from its copy in `master-inserts`.
    """
    logging.basicConfig()
    # Loading app will have configured from config file. Reconfigure here:
    logging.getLogger('encoded').setLevel(logging.DEBUG)

    parser = argparse.ArgumentParser(  # noqa - PyCharm wrongly thinks the formatter_class is specified wrong here.
        description="Update Inserts",
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--env',
                        default='data',
                        help='FF environment to update from. Defaults to data')
    parser.add_argument('--dest',
                        default='temp-local-inserts',
                        help="destination file in inserts dir to write to")
    parser.add_argument(
        '--item-type',
        action='append',
        default=[],
        help="item type, e.g. file_fastq. Defaults to all types")
    # NOTE: with action='append' and a non-empty default, fields given on the
    # command line are added to these defaults rather than replacing them.
    parser.add_argument(
        '--ignore-field',
        action='append',
        default=[
            "submitted_by", "date_created", "last_modified", "schema_version"
        ],
        help='field name to ignore when running expand_es_metadata')
    parser.add_argument('--from-search',
                        help='query passed to search_metadata to find uuids')

    args = parser.parse_args()
    # this will work since bin/ commands are run from root FF directory
    inserts_location = 'src/encoded/tests/data'
    # hardcode these to prevent accidental creation of inserts files
    inserts_files = [
        'inserts', 'master-inserts', 'perf-testing', 'workbook-inserts',
        'temp-local-inserts'
    ]
    if args.dest not in inserts_files:
        raise Exception('Specified inserts destination %s must be one of: %s' %
                        (args.dest, inserts_files))
    inserts_path = '/'.join([inserts_location, args.dest])

    local_inserts, item_uuids = read_local_inserts_dir(args.dest, inserts_path,
                                                       args.item_type)

    # Used to preserve order of existing inserts in folder(s), if any.
    # Duplicate indices between different item types are OK and present;
    # the map is shallow (uuid -> position within its own item type).
    local_inserts_ordering_map = {}
    for local_inserts_for_type in local_inserts.values():
        for insrt_index, insrt_uuid in enumerate(local_inserts_for_type):
            local_inserts_ordering_map[insrt_uuid] = insrt_index

    # add uuids from the input search result, if present
    if args.from_search:
        use_search = args.from_search
        # get frame=object search results to keep response small
        if 'frame=' not in use_search:
            use_search += '&frame=object'
        search_res = search_metadata(use_search, ff_env=args.env)
        search_uuids = [item['uuid'] for item in search_res]
        logger.info('update_inserts: Will update using %s items from search',
                    len(search_uuids))
        item_uuids = list(set(item_uuids + search_uuids))

    # now find uuids and all linked from the given server
    svr_inserts, svr_uuids = expand_es_metadata(item_uuids,
                                                ff_env=args.env,
                                                store_frame='raw',
                                                add_pc_wfr=True,
                                                ignore_field=args.ignore_field)

    # if we are updating `inserts`, must make sure that items don't conflict
    # with those in `master-inserts`
    skip_uuids = set()
    if args.dest == 'inserts':
        master_path = '/'.join([inserts_location, 'master-inserts'])
        master_inserts, master_uuids = read_local_inserts_dir(
            'master-inserts', master_path)
        # build the lookup once instead of scanning master_uuids per item
        master_uuid_set = set(master_uuids)
        item_conflict_report = {}
        for item_type in svr_inserts:
            itype_err = []
            itype_okay = []
            for item in svr_inserts[item_type]:
                if item['uuid'] not in master_uuid_set:
                    continue
                # compare inserts through their key-sorted json serialization
                svr_json = json.dumps(item, sort_keys=True)
                mstr_json = json.dumps(
                    master_inserts[item_type][item['uuid']], sort_keys=True)
                if svr_json != mstr_json:
                    itype_err.append(item['uuid'])
                else:
                    # the json is the same. Remove from the `inserts` update
                    skip_uuids.add(item['uuid'])
                    itype_okay.append(item['uuid'])
            item_conflict_report[item_type] = {
                'error': itype_err,
                'okay': itype_okay
            }
        if any(report['error'] for report in item_conflict_report.values()):
            error_report = {
                it: item_conflict_report[it]['error']
                for it in item_conflict_report
            }
            logger.error(
                'update_inserts: Cannot update the following items in "inserts" directory,'
                ' since there are conflicting items with different values'
                ' in the master-inserts. Update those first. Conflicts:\n%s',
                json.dumps(error_report, indent=4))
            raise Exception(
                'Cannot load inserts as there are conflicting items in `master-inserts`'
            )
        elif any(report['okay'] for report in item_conflict_report.values()):
            conflict_report = {
                it: item_conflict_report[it]['okay']
                for it in item_conflict_report
            }
            logger.warning(
                'update_inserts: The following items are already in "master-inserts".'
                ' Will not add to "inserts". Items:\n%s',
                json.dumps(conflict_report, indent=4))

    # now we need to update the server inserts with contents from local inserts
    # so that existing information is not lost
    svr_uuid_set = set(svr_uuids)
    for item_type in svr_inserts:
        if skip_uuids:
            # remove items specified by skip uuids
            svr_inserts[item_type] = [
                insrt for insrt in svr_inserts[item_type]
                if insrt['uuid'] not in skip_uuids
            ]
        # items already present locally keep their position; items without a
        # known position get the fallback index 99999
        svr_inserts[item_type].sort(
            key=lambda insrt: local_inserts_ordering_map.get(
                insrt["uuid"], 99999))
        # keep local items that the server did not return
        for item_uuid, local_item in local_inserts.get(item_type, {}).items():
            if item_uuid not in svr_uuid_set and item_uuid not in skip_uuids:
                svr_inserts[item_type].append(local_item)

    dump_results_to_json(svr_inserts, inserts_path)
    logger.info('update_inserts: Successfully wrote to %s', inserts_path)
    for item_type in svr_inserts:
        logger.info('update_inserts: Wrote %s items to %s',
                    len(svr_inserts[item_type]), item_type + '.json')

예제 #28

0

파일 보기

파일: badge_checks.py 프로젝트: 4dn-dcic/foursight

def consistent_replicate_info(connection, **kwargs):
    '''
    Report replicate experiment sets whose member experiments disagree on
    selected experiment or biosample metadata.

    Each ExperimentSetReplicate that has experiments is compared field by
    field; a field is recorded when the stringified values differ between
    the replicate experiments. Sets whose status is not in REV are handed to
    compare_badges_and_messages for the 'inconsistent-replicate-info' badge.

    The action attached to the check patches the badges, with messages naming
    the inconsistent fields and their values.
    '''
    check = CheckResult(connection, 'consistent_replicate_info')
    auth = connection.ff_keys

    set_hits = ff_utils.search_metadata(
        'search/?type=ExperimentSetReplicate&field=experiments_in_set.%40id&field=uuid&field=status&field=lab.display_title',
        key=auth)
    # only sets that actually list experiments can be compared
    repsets = [hit for hit in set_hits if hit.get('experiments_in_set')]
    exp_hits = ff_utils.search_metadata('search/?type=Experiment&frame=object', key=auth)
    bio_hits = ff_utils.search_metadata('search/?type=Experiment&field=biosample', key=auth)
    # both lookups are keyed by the experiment @id
    experiment_by_id = {hit['@id']: hit for hit in exp_hits}
    biosample_by_exp = {hit['@id']: hit['biosample'] for hit in bio_hits}

    experiment_fields = [
        'lab',
        'award',
        'experiment_type',
        'crosslinking_method',
        'crosslinking_time',
        'crosslinking_temperature',
        'digestion_enzyme',
        'enzyme_lot_number',
        'digestion_time',
        'digestion_temperature',
        'tagging_method',
        'tagging_rounds',
        'ligation_time',
        'ligation_temperature',
        'ligation_volume',
        'biotin_removed',
        'protocol',
        'protocol_variation',
        'follows_sop',
        'average_fragment_size',
        'fragment_size_range',
        'fragmentation_method',
        'fragment_size_selection_method',
        'rna_tag',
        'target_regions',
        'dna_label',
        'labeling_time',
        'antibody',
        'antibody_lot_id',
        'microscopy_technique',
        'imaging_paths',
    ]
    check.brief_output = {
        REV_KEY: {},
        RELEASED_KEY: {
            'Add badge': {},
            'Remove badge': {},
            'Keep badge and edit messages': {},
        },
    }
    compare = {}
    results = {}
    for repset in repsets:
        mismatches = {}
        exp_ids = [member['@id'] for member in repset['experiments_in_set']]

        # experiment-level fields
        for field in experiment_fields:
            vals = [stringify(experiment_by_id[exp_id].get(field)) for exp_id in exp_ids]
            if field == 'average_fragment_size' and 'None' not in vals:
                # fragment sizes within 25% of their mean are treated as consistent
                sizes = [int(val) for val in vals]
                spread = max(sizes) - min(sizes)
                mean_size = sum(sizes) / len(sizes)
                if spread / mean_size < 0.25:
                    continue
            if len(set(vals)) > 1:
                mismatches[field] = vals

        # biosample-level summary fields
        for bfield in ('treatments_summary', 'modifications_summary'):
            bvals = [stringify(biosample_by_exp[exp_id].get(bfield)) for exp_id in exp_ids]
            if len(set(bvals)) > 1:
                mismatches[bfield] = bvals

        biosource_vals = []
        for exp_id in exp_ids:
            sources = biosample_by_exp[exp_id].get('biosource')
            biosource_vals.append(stringify([source['@id'] for source in sources]))
        if len(set(biosource_vals)) > 1:
            mismatches['biosource'] = biosource_vals

        # cell culture details are compared only when at least one biosample has them
        if any(biosample_by_exp[exp_id].get('cell_culture_details') for exp_id in exp_ids):
            for ccfield in ('synchronization_stage', 'differentiation_stage', 'follows_sop'):
                ccvals = []
                for exp_id in exp_ids:
                    linked = biosample_by_exp[exp_id].get('cell_culture_details').get(ccfield)
                    ccvals.append(stringify([entry['@id'] for entry in linked]))
                if len(set(ccvals)) > 1:
                    mismatches[ccfield] = ccvals

        # same for biosample protocols
        if any(biosample_by_exp[exp_id].get('biosample_protocols') for exp_id in exp_ids):
            bp_vals = []
            for exp_id in exp_ids:
                protocols = biosample_by_exp[exp_id].get('biosample_protocols', [])
                bp_vals.append(stringify([protocol['@id'] for protocol in protocols]))
            if len(set(bp_vals)) > 1:
                mismatches['biosample_protocols'] = bp_vals

        if not mismatches:
            continue

        details = sorted('{}: {}'.format(name, stringify(values)) for name, values in mismatches.items())
        msgs = ['Inconsistent replicate information in ' + detail for detail in details]
        text = '{} - inconsistency in {}'.format(repset['@id'][-13:-1], ', '.join(mismatches))
        lab = repset['lab']['display_title']
        in_review = repset['status'] in REV
        audit_key = REV_KEY if in_review else RELEASED_KEY
        results[repset['@id']] = {'status': audit_key, 'lab': lab, 'info': text}
        if audit_key == REV_KEY:
            check.brief_output[audit_key].setdefault(lab, []).append(text)
        if not in_review:
            compare[repset['@id']] = msgs

    to_add, to_remove, to_edit, ok = compare_badges_and_messages(
        compare, 'ExperimentSetReplicate', 'inconsistent-replicate-info', connection.ff_keys
    )
    by_action = {'Add badge': to_add, 'Remove badge': to_remove, 'Keep badge and edit messages': to_edit}
    # file each flagged set under the first badge action that mentions it
    for set_id, summary in results.items():
        for action, affected in by_action.items():
            if set_id in affected:
                check.brief_output[RELEASED_KEY][action].setdefault(summary['lab'], []).append(summary['info'])
                break
    check.brief_output[RELEASED_KEY]['Remove badge'] = list(to_remove)

    needs_patch = bool(to_add or to_remove or to_edit)
    if needs_patch:
        check.status = 'WARN'
        check.summary = 'Replicate Info badges need patching'
        check.description = ('{} ExperimentSetReplicates found that need a replicate-info badge patched'
                             ''.format(len(to_add) + len(to_remove) + len(to_edit)))
    else:
        check.status = 'PASS'
        check.summary = 'Replicate Info badges are up-to-date'
        check.description = 'No ExperimentSetReplicates found that need a replicate-info badge patched'
    check.full_output = {'Add badge': to_add,
                         'Remove badge': to_remove,
                         'Keep badge and edit messages': to_edit,
                         'Keep badge (no change)': ok}
    check.action = 'patch_badges_for_inconsistent_replicate_info'
    if needs_patch:
        check.allow_action = True
    return check

예제 #29

0

파일 보기

def _workflow_step_io_issues(step, side):
    """
    Collect the problems found on one side of a workflow step.

    step -- a step dict from a Workflow item's `steps` list
    side -- 'Input' (checks `inputs` and their `source` entries) or
            'Output' (checks `outputs` and their `target` entries)

    Returns a list of (category, message) tuples in the order: missing
    meta.file_format entries, duplicate argument names, duplicate
    source/target names. Missing or null lists are treated as empty.
    """
    if side == 'Input':
        io_key, link_key, link_label = 'inputs', 'source', 'Source'
    else:
        io_key, link_key, link_label = 'outputs', 'target', 'Target'
    step_name = step.get('name')
    step_ios = step.get(io_key) or []
    found = []

    # file arguments must declare a file format
    for step_io in step_ios:
        meta = step_io.get('meta') or {}
        if (meta.get('type') in ['data file', 'reference file']
                and not meta.get('file_format')):
            found.append((
                'Missing meta.file_format property in Workflow Step ' + side,
                'Missing meta.file_format property in Workflow Step `{}` {} `{}`'
                ''.format(step_name, side, step_io.get('name'))))

    # no duplicates in argument names
    names = [step_io.get('name') for step_io in step_ios]
    if len(set(names)) != len(names):
        found.append((
            'Duplicate {} Names in Workflow Step'.format(side),
            'Duplicate {} Names in Workflow Step {}'.format(side, step_name)))

    # no duplicates in source/target names; entries without a step are global
    links = [(link.get('name'), link.get('step', 'GLOBAL'))
             for step_io in step_ios
             for link in step_io.get(link_key) or []]
    if len(set(links)) != len(links):
        found.append((
            'Duplicate {} {} Names in Workflow Step'.format(side, link_label),
            'Duplicate {} {} Names in Workflow Step {}'.format(
                side, link_label, step_name)))
    return found


def workflow_properties(connection, **kwargs):
    """
    Check non-provenance Workflow items for problems in their `steps`.

    Flags duplicate input/output names, duplicate input source / output
    target names, and file inputs/outputs lacking meta.file_format.

    brief_output maps each issue category to the @ids of affected workflows;
    full_output maps each affected workflow @id to its list of messages.
    Status is WARN when any workflow has issues, PASS otherwise.

    Workflows or steps that lack `steps`, `inputs`, `outputs`, `source`,
    `target` or `meta` are treated as having none, so a single incomplete
    item no longer aborts the whole check.
    """
    check = CheckResult(connection, 'workflow_properties')

    workflows = ff_utils.search_metadata(
        'search/?type=Workflow&category!=provenance&frame=object',
        key=connection.ff_keys)
    bad = {
        'Duplicate Input Names in Workflow Step': [],
        'Duplicate Output Names in Workflow Step': [],
        'Duplicate Input Source Names in Workflow Step': [],
        'Duplicate Output Target Names in Workflow Step': [],
        'Missing meta.file_format property in Workflow Step Input': [],
        'Missing meta.file_format property in Workflow Step Output': []
    }
    by_wf = {}
    for wf in workflows:
        found = []
        for step in wf.get('steps') or []:
            found.extend(_workflow_step_io_issues(step, 'Input'))
            found.extend(_workflow_step_io_issues(step, 'Output'))
        if not found:
            continue
        # categories travel with the issues, so names that happen to contain
        # one of the message phrases cannot cause a misclassification
        categories = {category for category, _ in found}
        for category, wf_ids in bad.items():
            if category in categories:
                wf_ids.append(wf['@id'])
        by_wf[wf['@id']] = [message for _, message in found]

    if by_wf:
        check.status = 'WARN'
        check.summary = 'Workflows found with issues in `steps`'
        check.description = (
            '{} workflows found with duplicate item names or missing fields'
            ' in `steps`'.format(len(by_wf)))
    else:
        check.status = 'PASS'
        check.summary = 'No workflows with issues in `steps` field'
        check.description = (
            'No workflows found with duplicate item names or missing fields'
            ' in steps property')
    check.brief_output = bad
    check.full_output = by_wf
    return check

예제 #30

0

파일 보기

파일: wfr_encode_checks.py 프로젝트: 4dn-dcic/foursight

def chipseq_status(connection, **kwargs):
    """
    Check ChIP-seq experiment sets and report which pipeline steps are
    complete, running, missing or problematic for each set.

    Per set: experiments with more than two fastq entries first go through the
    merge step (step0). Control sets then run step1c on each experiment.
    Other sets run step1 on each experiment and, once every experiment and
    every required control file is available, step2 on the set.

    Keyword arguments (forwarded to wfr_utils.build_exp_type_query):
    lab_title -- limit search with a lab i.e. Bing+Ren, UCSD
    start_date -- limit search to files generated since a date formatted YYYY-MM-DD
    run_time -- assume runs beyond run_time are dead

    Returns the CheckResult. full_output collects per-set entries under
    'skipped', 'running_runs', 'needs_runs', 'completed_runs' and
    'problematic_runs'; the action is allowed when runs are missing or
    completed results are ready to be patched.
    """
    start = datetime.utcnow()
    check = CheckResult(connection, 'chipseq_status')
    my_auth = connection.ff_keys
    check.action = "chipseq_start"
    check.description = "run missing steps and add processing results to processed files, match set status"
    check.brief_output = []
    check.summary = ""
    check.full_output = {'skipped': [], 'running_runs': [], 'needs_runs': [],
                         'completed_runs': [], 'problematic_runs': []}
    check.status = 'PASS'
    exp_type = 'ChIP-seq'
    # completion tag
    tag = wfr_utils.accepted_versions[exp_type][-1]
    # check indexing queue
    check, skip = wfr_utils.check_indexing(check, connection)
    if skip:
        return check
    # Build the query, add date and lab if available
    query = wfr_utils.build_exp_type_query(exp_type, kwargs)
    res = ff_utils.search_metadata(query, key=my_auth)
    print(len(res))

    if not res:
        check.summary = 'All Good!'
        return check
    # run step 0 on all experiments with more than 2 sets of files
    # for control sets, run step1c on each experiment and finish
    # for non-control sets, run step1 on each experiment, check if control is ready, run step2 on set
    step0_name = 'merge-fastq'
    step1_name = 'encode-chipseq-aln-chip'
    step1c_name = 'encode-chipseq-aln-ctl'
    step2_name = 'encode-chipseq-postaln'

    for a_set in res:
        set_acc = a_set['accession']
        all_items, all_uuids = ff_utils.expand_es_metadata([a_set['uuid']], my_auth,
                                                           store_frame='embedded',
                                                           add_pc_wfr=True,
                                                           ignore_field=[  # 'experiment_relation',
                                                                         'biosample_relation',
                                                                         'references',
                                                                         'reference_pubs'])
        now = datetime.utcnow()
        print(a_set['accession'], (now-start).seconds, len(all_uuids))
        # stop processing further sets once the elapsed time exceeds lambda_limit
        if (now-start).seconds > lambda_limit:
            break
        # are all files uploaded ?
        all_uploaded = True
        for a_file in all_items['file_fastq']:
            if a_file['status'] in ['uploading', 'upload failed']:
                all_uploaded = False

        if not all_uploaded:
            final_status = a_set['accession'] + ' skipped, waiting for file upload'
            print(final_status)
            check.brief_output.append(final_status)
            check.full_output['skipped'].append({a_set['accession']: 'files status uploading'})
            continue

        all_wfrs = all_items.get('workflow_run_awsem', []) + all_items.get('workflow_run_sbg', [])
        all_files = [i for typ in all_items for i in all_items[typ] if typ.startswith('file_')]
        all_qcs = [i for typ in all_items for i in all_items[typ] if typ.startswith('quality_metric')]
        library = {'wfrs': all_wfrs, 'files': all_files, 'qcs': all_qcs}
        keep = {'missing_run': [], 'running': [], 'problematic_run': []}
        # if all completed, patch this info
        complete = {'patch_opf': [],
                    'add_tag': []}

        # some feature to extract from each set
        control = ""  # True or False (True if set is control)
        control_set = ""  # None if there are no control experiments or if the set is control
        target_type = ""  # Histone or TF (or None for control)
        paired = ""  # single or paired , checked for each experiment
        organism = ""
        replicate_exps = a_set['replicate_exps']
        replicate_exps = sorted(replicate_exps, key=lambda x: [x['bio_rep_no'], x['tec_rep_no']])
        # get organism, target and control from the first replicate
        f_exp = replicate_exps[0]['replicate_exp']['uuid']
        # have to do another get for control experiments if there is one
        f_exp_resp = [i for i in all_items['experiment_seq'] if i['uuid'] == f_exp][0]
        control, control_set, target_type, organism = wfr_utils.get_chip_info(f_exp_resp, all_items)
        print('ORG:', organism, "CONT:", control, "TARGET:", target_type, "CONT_SET:", control_set)
        set_summary = " - ".join([set_acc, str(organism), str(target_type), str(control)])
        # sanity checks
        # if control and also has an AB with target
        if control and target_type:
            set_summary += "| error - has target and is control"
            check.brief_output.append(set_summary)
            check.full_output['skipped'].append({set_acc: set_summary})
            continue
        # can only process mouse and human at the moment
        if organism not in ['mouse', 'human']:
            set_summary += "| organism not ready for chip"
            check.brief_output.append(set_summary)
            check.full_output['skipped'].append({set_acc: set_summary})
            continue
        # if not control, we need a target
        if not control and not target_type:
            set_summary += "| missing target type"
            check.brief_output.append(set_summary)
            check.full_output['skipped'].append({set_acc: set_summary})
            continue
        # collect results from step1 runs for step2
        ta = []
        taxcor = []
        ta_cnt = []
        # track if all experiments completed step0 and step1
        ready_for_step2 = True
        # track if all control experiments are completed processing; this is
        # set-level state read after the loop, so it must not be reset for
        # each experiment (otherwise only the last replicate would count)
        control_ready = True
        for an_exp in replicate_exps:
            # track if all experiments completed step0
            ready_for_step1 = True
            exp_id = an_exp['replicate_exp']['accession']
            exp_resp = [i for i in all_items['experiment_seq'] if i['accession'] == exp_id][0]
            exp_files, paired = wfr_utils.get_chip_files(exp_resp, all_files)
            # if there are more then 2 files, we need to merge:
            print(exp_id, len(exp_files), paired)
            # if too many input, merge them
            if len(exp_files) > 2:
                # exp_files format [[pair1,pair2], [pair1, pair2]]  @id
                input_list = []
                if paired == 'paired':
                    # first add paired end 1s
                    input_list.append([i[0] for i in exp_files])
                    input_list.append([i[1] for i in exp_files])
                elif paired == 'single':
                    input_list.append([i[0] for i in exp_files])
                # collect files for step1 and step1c
                merged_files = []
                step0_status = 'complete'
                merge_enum = 0
                # if paired, need to run merge twice for each end
                for merge_case in input_list:
                    merge_enum += 1
                    # RUN STEP 0
                    s0_input_files = {'input_fastqs': merge_case}
                    s0_tag = exp_id + '_p' + str(merge_enum)
                    keep, step0_status, step0_output = wfr_utils.stepper(library, keep,
                                                                         'step0', s0_tag, merge_case,
                                                                         s0_input_files, step0_name, 'merged_fastq', organism=organism)
                    if step0_status == 'complete':
                        merged_files.append(step0_output)
                    else:
                        ready_for_step1 = False

                if ready_for_step1:
                    # rewrite exp_files with merged ones
                    exp_files = [[]]
                    for a_merged in merged_files:
                        exp_files[0].append(a_merged)
            # if step0 was not complete, skip checks for step2
            if not ready_for_step1:
                ready_for_step2 = False
                continue

            # step1 references:
            # (organism is guaranteed to be human or mouse by the sanity check above)
            input_files = {}
            if organism == 'human':
                org = 'hs'
                input_files['chip.bwa_idx_tar'] = '/files-reference/4DNFIZQB369V/'
                input_files['chip.blacklist'] = '/files-reference/4DNFIZ1TGJZR/'
                input_files['chip.chrsz'] = '/files-reference/4DNFIZJB62D1/'
                input_files['additional_file_parameters'] = {"chip.bwa_idx_tar": {"rename": "GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.tar"}}
            if organism == 'mouse':
                org = 'mm'
                input_files['chip.bwa_idx_tar'] = '/files-reference/4DNFIZ2PWCC2/'
                input_files['chip.blacklist'] = '/files-reference/4DNFIZ3FBPK8/'
                input_files['chip.chrsz'] = '/files-reference/4DNFIBP173GC/'
                input_files['additional_file_parameters'] = {"chip.bwa_idx_tar": {"rename": "mm10_no_alt_analysis_set_ENCODE.fasta.tar"}}
            # step1 Parameters
            parameters = {}
            parameters["chip.gensz"] = org
            if paired == 'single':
                frag_temp = [300]
                fraglist = frag_temp * len(exp_files)
                parameters['chip.fraglen'] = fraglist
                parameters['chip.paired_end'] = False
            elif paired == 'paired':
                parameters['chip.paired_end'] = True

            # run step1 for control
            if control:
                # control run on tf mode
                # input_files = {'chip.ctl_fastqs': [exp_files]}
                input_files['chip.ctl_fastqs'] = [exp_files]
                control_parameters = {
                    "chip.pipeline_type": 'tf',
                    "chip.choose_ctl.always_use_pooled_ctl": True,
                    "chip.bam2ta_ctl.regex_grep_v_ta": "chr[MUE]|random|alt",
                    "chip.bwa_ctl.cpu": 8,
                    "chip.merge_fastq_ctl.cpu": 8,
                    "chip.filter_ctl.cpu": 8,
                    "chip.bam2ta_ctl.cpu": 8,
                    "chip.align_only": True
                }
                parameters.update(control_parameters)

                s1c_input_files = input_files
                s1c_tag = exp_id
                keep, step1c_status, step1c_output = wfr_utils.stepper(library, keep,
                                                                       'step1c', s1c_tag, exp_files,
                                                                       s1c_input_files, step1c_name, 'chip.first_ta_ctl',
                                                                       additional_input={'parameters': parameters}, organism=organism)
                if step1c_status == 'complete':
                    # accumulate files to patch on experiment
                    patch_data = [step1c_output, ]
                    complete['patch_opf'].append([exp_id, patch_data])
                else:
                    # don't patch anything if at least one exp is still missing
                    ready_for_step2 = False
                print('step1c')
                print(step1c_status, step1c_output)

            # run step1
            else:
                # input_files = {'chip.fastqs': [exp_files]}
                input_files['chip.fastqs'] = [exp_files]
                exp_parameters = {
                    "chip.pipeline_type": target_type,
                    "chip.choose_ctl.always_use_pooled_ctl": True,
                    "chip.bam2ta.regex_grep_v_ta": "chr[MUE]|random|alt",
                    "chip.bwa.cpu": 8,
                    "chip.merge_fastq.cpu": 8,
                    "chip.filter.cpu": 8,
                    "chip.bam2ta.cpu": 8,
                    "chip.xcor.cpu": 8,
                    "chip.align_only": True
                }
                parameters.update(exp_parameters)

                s1_input_files = input_files
                s1_tag = exp_id
                # if complete, step1_output will have a list of 2 files, first_ta, and fist_ta_xcor
                keep, step1_status, step1_output = wfr_utils.stepper(library, keep,
                                                                     'step1', s1_tag, exp_files,
                                                                     s1_input_files, step1_name, ['chip.first_ta', 'chip.first_ta_xcor'],
                                                                     additional_input={'parameters': parameters}, organism=organism)
                if step1_status == 'complete':
                    exp_ta_file = step1_output[0]
                    exp_taxcor_file = step1_output[1]
                    # accumulate files to patch on experiment
                    patch_data = [exp_ta_file, ]
                    complete['patch_opf'].append([exp_id, patch_data])
                    ta.append(exp_ta_file)
                    taxcor.append(exp_taxcor_file)

                    # find the control file if there is a control set found
                    if control_set:
                        try:
                            exp_cnt_ids = [i['experiment'] for i in exp_resp['experiment_relation'] if i['relationship_type'] == 'controlled by']
                            exp_cnt_ids = [i['@id'] for i in exp_cnt_ids]
                        except Exception:
                            # malformed or missing relation data; do not swallow
                            # KeyboardInterrupt/SystemExit like a bare except would
                            control_ready = False
                            print('Control Relation has problems for this exp', exp_id)
                            continue
                        if len(exp_cnt_ids) != 1:
                            control_ready = False
                            print('Multiple controls for this exp', exp_id)
                            continue
                        exp_cnt_id = exp_cnt_ids[0]
                        print('controled by set', exp_cnt_id)
                        # have to do a get for the control experiment
                        exp_cnt_resp = [i for i in all_items['experiment_seq'] if i['@id'] == exp_cnt_id][0]
                        cont_file = ''
                        # check opf for control file
                        for opf_case in exp_cnt_resp.get('other_processed_files', []):
                            if opf_case['title'] == 'ENCODE ChIP-Seq Pipeline - Preliminary Files':
                                opf_files = opf_case['files']
                                assert len(opf_files) == 1
                                cont_file = opf_files[0]['@id']
                        # if not in opf, check processed files
                        if not cont_file:
                            pf_list = exp_cnt_resp.get('processed_files', [])
                            if pf_list:
                                assert len(pf_list) == 1
                                cont_file = pf_list[0]['@id']
                        # did we find it, if so, add it to ta_cnt
                        if cont_file:
                            ta_cnt.append(cont_file)
                        else:
                            control_ready = False

                else:
                    # don't patch anything if at least one exp is still missing
                    ready_for_step2 = False
                print('step1')
                print(step1_status, step1_output, control_ready)
        # back to set level
        final_status = set_acc  # start the reporting with acc
        all_completed = False
        # is step0 step1 complete
        if ready_for_step2 and not control_ready:
            final_status += ' waiting for control experiments to finish processing'
        elif ready_for_step2:
            # for control, add tag to set, and files to experiments
            if control:
                complete['add_tag'] = [set_acc, tag]
            # for non controls check for step2
            else:
                # this only works with 2 experiments, if 3, pick best 2, if more, skip for now
                if len(ta) > 3:
                    set_summary += "| skipped - more then 3 experiments in set, can not process at the moment"
                    check.brief_output.append(set_summary)
                    check.full_output['skipped'].append({set_acc: set_summary})
                    continue
                if len(ta) > 2:
                    ta_2 = []
                    taxcor_2 = []
                    print('ExperimentSet has 3 experiments, selecting best 2')
                    ta_2 = wfr_utils.select_best_2(ta, all_files, all_qcs)
                    # xcor does not have qc, use ta indexes to find the correct files
                    for ta_f in ta_2:
                        taxcor_2.append(taxcor[ta.index(ta_f)])
                    ta = ta_2
                    taxcor = taxcor_2
                    # for control files ,also select best2
                    ta_cnt = wfr_utils.select_best_2(ta_cnt, all_files, all_qcs)

                # collect step2 input files
                s2_input_files = {}
                if organism == 'human':
                    org = 'hs'
                    s2_input_files['chip.blacklist'] = '/files-reference/4DNFIZ1TGJZR/'
                    s2_input_files['chip.chrsz'] = '/files-reference/4DNFIZJB62D1/'
                if organism == 'mouse':
                    org = 'mm'
                    s2_input_files['chip.blacklist'] = '/files-reference/4DNFIZ3FBPK8/'
                    s2_input_files['chip.chrsz'] = '/files-reference/4DNFIBP173GC/'

                def rename_chip(input_at_id_list):
                    # rename bed.gz to tagAlign.gz
                    # (the accession is the third '/'-separated piece of each @id)
                    renamed = []
                    for a_file in input_at_id_list:
                        acc = a_file.split('/')[2]
                        renamed.append(acc + '.tagAlign.gz')
                    return renamed

                s2_input_files['additional_file_parameters'] = {}
                s2_input_files['chip.tas'] = ta
                s2_input_files['additional_file_parameters']['chip.tas'] = {"rename": rename_chip(ta)}
                s2_input_files['chip.bam2ta_no_filt_R1.ta'] = taxcor
                s2_input_files['additional_file_parameters']['chip.bam2ta_no_filt_R1.ta'] = {"rename": rename_chip(taxcor)}
                if ta_cnt:
                    s2_input_files['chip.ctl_tas'] = ta_cnt
                    s2_input_files['additional_file_parameters']['chip.ctl_tas'] = {"rename": rename_chip(ta_cnt)}

                # collect parameters
                parameters = {}
                if paired == 'single':
                    chip_p = False
                elif paired == 'paired':
                    chip_p = True
                if not control_set:
                    if target_type == 'histone':
                        set_summary += "| skipped - histone without control needs attention, ie change to tf"
                        check.brief_output.append(set_summary)
                        check.full_output['skipped'].append({set_acc: set_summary})
                        continue
                run_ids = {'desc': set_acc + a_set.get('description', '')}
                parameters = {
                    "chip.pipeline_type": target_type,
                    "chip.paired_end": chip_p,
                    "chip.choose_ctl.always_use_pooled_ctl": True,
                    "chip.qc_report.desc": run_ids['desc'],
                    "chip.gensz": org,
                    "chip.xcor.cpu": 4,
                }
                if paired == 'single':
                    frag_temp = [300]
                    fraglist = frag_temp * len(ta)
                    parameters['chip.fraglen'] = fraglist

                # if the target is a tf and there is no control, use macs2
                if not control_set:
                    if target_type == 'tf':
                        parameters['chip.peak_caller'] = "macs2"

                s2_tag = set_acc
                # if complete, step2_output will have a list of 3 files:
                # optimal peak, conservative peak and signal fold change
                keep, step2_status, step2_output = wfr_utils.stepper(library, keep,
                                                                     'step2', s2_tag, ta,
                                                                     s2_input_files, step2_name,
                                                                     ['chip.optimal_peak', 'chip.conservative_peak', 'chip.sig_fc'],
                                                                     additional_input={'parameters': parameters}, organism=organism)
                if step2_status == 'complete':
                    set_opt_peak = step2_output[0]
                    set_cons_peak = step2_output[1]
                    set_sig_fc = step2_output[2]
                    # accumulate files to patch on experiment
                    patch_data = [set_opt_peak, set_cons_peak, set_sig_fc]
                    complete['patch_opf'].append([set_acc, patch_data])
                    complete['add_tag'] = [set_acc, tag]
                    all_completed = True

        # unpack results
        missing_run = keep['missing_run']
        running = keep['running']
        problematic_run = keep['problematic_run']
        if all_completed:
            final_status += ' completed'
        else:
            if missing_run:
                final_status += ' |Missing: ' + " ".join([i[0] for i in missing_run])
            if running:
                final_status += ' |Running: ' + " ".join([i[0] for i in running])
            if problematic_run:
                final_status += ' |Problem: ' + " ".join([i[0] for i in problematic_run])

        # add dictionaries to main ones
        check.brief_output.append(final_status)
        print(final_status)
        if running:
            check.full_output['running_runs'].append({set_acc: running})
        if missing_run:
            check.full_output['needs_runs'].append({set_acc: missing_run})
        if problematic_run:
            check.full_output['problematic_runs'].append({set_acc: problematic_run})
        # if made it till the end
        if complete.get('add_tag'):
            assert not running
            assert not problematic_run
            assert not missing_run
            check.full_output['completed_runs'].append(complete)

    # complete check values
    check.summary = ""
    if check.full_output['running_runs']:
        check.summary = str(len(check.full_output['running_runs'])) + ' running|'
    if check.full_output['skipped']:
        check.summary += str(len(check.full_output['skipped'])) + ' skipped|'
        check.status = 'WARN'
    if check.full_output['needs_runs']:
        check.summary += str(len(check.full_output['needs_runs'])) + ' missing|'
        check.status = 'WARN'
        check.allow_action = True
    if check.full_output['completed_runs']:
        check.summary += str(len(check.full_output['completed_runs'])) + ' completed|'
        check.status = 'WARN'
        check.allow_action = True
    if check.full_output['problematic_runs']:
        check.summary += str(len(check.full_output['problematic_runs'])) + ' problem|'
        check.status = 'WARN'
    return check

예제 #31

0

파일 보기

# The cookie secret and the sqlite database both live under the directory
# named by DATA_VOLUME_CONTAINER; indexing os.environ directly makes a
# missing variable fail with KeyError.
data_dir = os.environ['DATA_VOLUME_CONTAINER']
c.JupyterHub.cookie_secret_file = os.path.join(data_dir,
                                               'jupyterhub_cookie_secret')
c.JupyterHub.db_url = os.path.join(data_dir, 'jupyterhub.sqlite')

# Whitelist users and admins
c.Authenticator.whitelist = whitelist = set()
c.Authenticator.admin_users = admin = set()
c.JupyterHub.admin_access = True
# comma-separated admin emails, lowercased; blank entries are dropped because
# ''.split(',') yields [''], which would otherwise count as an admin email
admin_emails = [
    email.strip().lower()
    for email in os.environ.get('ADMIN_EMAILS', '').split(',')
    if email.strip()
]
ff_users = ff_utils.search_metadata('search/?type=User&field=email',
                                    key=ff_keys)
for ff_user in ff_users:
    if not ff_user.get('email'):
        continue
    whitelist.add(ff_user['email'])
    # base admin off of a set environment variable, for now
    if ff_user['email'].lower() in admin_emails:
        admin.add(ff_user['email'])

# add API token to the instance. Use **only** the first admin email;
# no token is registered when no admin email is configured
if admin_emails:
    c.JupyterHub.api_tokens = {
        jh_token['secret']: admin_emails[0],
    }

# set up services

예제 #32

0

파일 보기

파일: badge_checks.py 프로젝트: 4dn-dcic/foursight

def yellow_flag_biosamples(connection, **kwargs):
    '''
    Checks biosamples for required metadata:
    1. Culture harvest date, doubling number, passage number, culture duration
    2. Morphology image
    3. Karyotyping (authentication doc or string field) for any biosample derived
    from pluripotent cell line that has been passaged more than 10 times beyond
    the first thaw of the original vial.
    4. Differentiation authentication for differentiated cells.
    5. HAP-1 biosamples must have ploidy authentication.
    6. For phase 2 samples must include FBS info (post 2022-05-10)

    Biosamples whose status is in REV are only listed in the brief output;
    all others are compared against their existing
    'biosample-metadata-incomplete' badges via compare_badges_and_messages.

    Returns the populated CheckResult; the action is allowed only when at
    least one badge needs to be added, removed or edited.
    '''
    check = CheckResult(connection, 'yellow_flag_biosamples')

    results = ff_utils.search_metadata('search/?type=Biosample', key=connection.ff_keys)
    flagged = {}
    check.brief_output = {RELEASED_KEY: {}, REV_KEY: []}

    fbs_chk_date = '2022-05-10'
    # loop-invariant lookup values, built once instead of per biosample
    no_culture_types = ('primary cell', 'tissue', 'multicellular organism')
    required_culture_fields = (
        'culture_harvest_date', 'doubling_number', 'passage_number', 'culture_duration', 'morphology_image'
    )
    valid_fbs = (
        "VWR 97068-091 Lot 035B15 (phase 1)",
        "Peak Serum PS-FBS2 Lot 21E1202 (phase 2)",
        "VWR 89510-184 lot 310B19 (phase 2)",
    )
    # protocol @id -> protocol_classification; the same protocol can be linked
    # from many biosamples, so each one is fetched only once per check run
    protocol_classes = {}

    def missing_part(message):
        # every message contains 'missing '; keep only what follows it
        return message[message.index('missing') + 8:]

    for result in results:
        messages = []
        bs_types = [bs.get('biosource_type') for bs in result.get('biosource', [])]
        # guard against an absent summary so re.search / `in` cannot raise TypeError
        summary = result.get('biosource_summary') or ''
        karyotype = False
        diff_auth = False
        ploidy = False
        bccs = result.get('cell_culture_details', [])
        if not bccs:
            # culture details are only optional when every biosource is of a
            # type listed in no_culture_types
            if not all(t in no_culture_types for t in bs_types):
                messages.append('Biosample missing Cell Culture Details')
        else:
            tier = re.search(r'\(Tier (1|2)\)', summary)
            for bcc in bccs:
                for item in required_culture_fields:
                    if not bcc.get(item):
                        messages.append('Biosample missing {}'.format(item))
                if bcc.get('karyotype'):
                    karyotype = True
                for protocol in bcc.get('authentication_protocols', []):
                    protocol_id = protocol['@id']
                    if protocol_id not in protocol_classes:
                        protocol_item = ff_utils.get_metadata(protocol_id, key=connection.ff_keys)
                        protocol_classes[protocol_id] = protocol_item.get('protocol_classification')
                    auth_type = protocol_classes[protocol_id]
                    if not karyotype and auth_type == 'Karyotype Authentication':
                        karyotype = True
                    elif auth_type == 'Differentiation Authentication':
                        diff_auth = True
                    elif auth_type == 'Ploidy Authentication':
                        ploidy = True
                # treat an explicit null passage number like a missing one
                passages = bcc.get('passage_number') or 0
                if 'tem cell' in ''.join(bs_types) and not karyotype:
                    if passages > 10:
                        messages.append('Biosample is a stem cell line over 10 passages but missing karyotype')
                    elif not passages:
                        messages.append('Biosample is a stem cell line with unknown passage number missing karyotype')
                # ISO dates compare correctly as strings; an absent start date
                # defaults to a value before the cut-off, i.e. no FBS requirement
                if tier and bcc.get('culture_start_date', '2000-01-01') > fbs_chk_date:
                    fbs_info = bcc.get('fbs_vendor_lot', '').strip()
                    if fbs_info not in valid_fbs:
                        messages.append('Tiered cell line cultured after {} missing 4DN specified FBS vendor and lot info'.format(fbs_chk_date))
        if result.get('biosample_type') == 'In vitro differentiated cells' and not diff_auth:
            messages.append('Differentiated biosample missing differentiation authentication')
        if 'HAP-1' in summary and not ploidy:
            messages.append('HAP-1 biosample missing ploidy authentication')
        if messages:
            # drop duplicate messages while keeping first-seen order
            messages = list(dict.fromkeys(messages))
            if result.get('status') in REV:
                # order-preserving de-duplication keeps the output stable between runs
                check.brief_output[REV_KEY].append('{} missing {}'.format(
                    result['@id'], ', '.join(dict.fromkeys(missing_part(item) for item in messages))
                ))
            else:
                flagged[result['@id']] = messages

    to_add, to_remove, to_edit, ok = compare_badges_and_messages(
        flagged, 'Biosample', 'biosample-metadata-incomplete', connection.ff_keys
    )
    check.action = 'patch_biosample_warning_badges'
    if to_add or to_remove or to_edit:
        check.status = 'WARN'
        check.summary = 'Yellow flag biosample badges need patching'
        check.description = '{} biosamples need warning badges patched'.format(
            len(to_add.values()) + len(to_remove.values()) + len(to_edit.values())
        )
        check.allow_action = True
    else:
        check.status = 'PASS'
        check.summary = 'Yellow flag biosample badges up-to-date'
        check.description = 'No yellow flag biosample badges need patching'
    check.full_output = {'Add badge': to_add,
                         'Remove badge': to_remove,
                         'Keep badge and edit messages': to_edit,
                         'Keep badge (no change)': ok}
    check.brief_output[RELEASED_KEY] = {
        'Add badge': ['{} missing {}'.format(
            k, ', '.join([missing_part(item) for item in flagged[k]])
        ) for k in to_add.keys()],
        'Remove badge': list(to_remove.keys()),
        'Keep badge and edit messages': ['{} missing {}'.format(
            k, ', '.join([missing_part(item) for item in flagged[k]])
        ) for k in to_edit.keys()]
    }
    return check

예제 #33

0

파일 보기

파일: wfr_encode_checks.py 프로젝트: 4dn-dcic/foursight

def atacseq_status(connection, **kwargs):
    """
    Report the ATAC-seq processing state of experiment sets and collect what
    needs to run next.

    For every set returned by the search, the function walks the pipeline
    steps (merge-fastq -> encode-atacseq-aln -> optional mergebed ->
    encode-atacseq-postaln) through wfr_utils.stepper and sorts the set into
    check.full_output as skipped / running / needs_runs / completed /
    problematic.

    Keyword arguments:
    lab_title -- limit search with a lab i.e. Bing+Ren, UCSD
    start_date -- limit search to files generated since a date formatted YYYY-MM-DD
    run_time -- assume runs beyond run_time are dead
    pick_best_2 -- False by default. If set the True, for sets more than 2 experiments,
                   2 best will be used instead of running mergebed

    Only pick_best_2 is read directly in this function; kwargs as a whole is
    handed to wfr_utils.build_exp_type_query (presumably where lab_title and
    start_date are consumed; run_time is not referenced here -- TODO confirm).

    Returns the populated CheckResult.
    """
    start = datetime.utcnow()
    check = CheckResult(connection, 'atacseq_status')
    my_auth = connection.ff_keys
    check.action = "atacseq_start"
    check.description = "run missing steps and add processing results to processed files, match set status"
    check.brief_output = []
    check.summary = ""
    check.full_output = {'skipped': [], 'running_runs': [], 'needs_runs': [],
                         'completed_runs': [], 'problematic_runs': []}
    check.status = 'PASS'
    exp_type = 'ATAC-seq'
    # completion tag: last entry of the accepted versions for this experiment type
    tag = wfr_utils.accepted_versions[exp_type][-1]
    pick_best_2 = kwargs.get('pick_best_2', False)
    # check indexing queue; return early when the helper asks to skip
    check, skip = wfr_utils.check_indexing(check, connection)
    if skip:
        return check
    # Build the query, add date and lab if available
    query = wfr_utils.build_exp_type_query(exp_type, kwargs)
    res = ff_utils.search_metadata(query, key=my_auth)
    print(len(res))

    if not res:
        check.summary = 'All Good!'
        return check
    # run step 0 on all experiments with more than 2 sets of files
    # step1 on each experiment,if multiple exps, merge beds, run step3 on set
    step0_name = 'merge-fastq'
    step1_name = 'encode-atacseq-aln'
    step2_name = 'mergebed'
    step3_name = 'encode-atacseq-postaln'

    for a_set in res:
        set_acc = a_set['accession']
        # fetch the set together with its linked items, grouped by item type
        all_items, all_uuids = ff_utils.expand_es_metadata([a_set['uuid']], my_auth,
                                                           store_frame='embedded',
                                                           add_pc_wfr=True,
                                                           ignore_field=['experiment_relation',
                                                                         'biosample_relation',
                                                                         'references',
                                                                         'reference_pubs'])
        now = datetime.utcnow()
        print(a_set['accession'], (now-start).seconds, len(all_uuids))
        # stop processing further sets once the elapsed time exceeds lambda_limit
        # (lambda_limit is defined outside this function); sets already handled
        # are still reported below
        if (now-start).seconds > lambda_limit:
            break
        # are all files uploaded ?
        all_uploaded = True
        for a_file in all_items['file_fastq']:
            if a_file['status'] in ['uploading', 'upload failed']:
                all_uploaded = False
        if not all_uploaded:
            final_status = a_set['accession'] + ' skipped, waiting for file upload'
            print(final_status)
            check.brief_output.append(final_status)
            check.full_output['skipped'].append({a_set['accession']: 'files status uploading'})
            continue
        # gather workflow runs, files and quality metrics into the lookup
        # structure that wfr_utils.stepper receives
        all_wfrs = all_items.get('workflow_run_awsem', []) + all_items.get('workflow_run_sbg', [])
        all_files = [i for typ in all_items for i in all_items[typ] if typ.startswith('file_')]
        all_qcs = [i for typ in all_items for i in all_items[typ] if typ.startswith('quality_metric')]
        library = {'wfrs': all_wfrs, 'files': all_files, 'qcs': all_qcs}
        # per-set accumulator passed to and returned from every stepper call
        keep = {'missing_run': [], 'running': [], 'problematic_run': []}
        # if all completed, patch this info
        complete = {'patch_opf': [],
                    'add_tag': []}
        set_acc = a_set['accession']

        # some feature to extract from each set
        paired = ""  # single or paired , checked for each experiment
        organism = ""
        replicate_exps = a_set['replicate_exps']
        # process experiments ordered by biological, then technical replicate number
        replicate_exps = sorted(replicate_exps, key=lambda x: [x['bio_rep_no'], x['tec_rep_no']])
        # get organism from the first replicate experiment's biosample
        f_exp = replicate_exps[0]['replicate_exp']['uuid']
        # have to do another get for control experiments if there is one
        f_exp_resp = [i for i in all_items['experiment_atacseq'] if i['uuid'] == f_exp][0]
        biosample = f_exp_resp['biosample']
        # NOTE(review): with several distinct organisms, which one ends up at
        # index 0 of list(set(...)) is arbitrary
        organism = list(set([bs['organism']['name'] for bs in biosample['biosource']]))[0]
        set_summary = " - ".join([set_acc, str(organism)])
        print(set_summary)
        # sanity checks
        # can only process mouse and human at the moment
        if organism not in ['mouse', 'human']:
            set_summary += "| organism not ready for atac"
            check.brief_output.append(set_summary)
            check.full_output['skipped'].append({set_acc: set_summary})
            continue

        # collect results from step1 runs for step2
        ta = []
        # track if all experiments completed step0 and step1
        ready_for_step2 = True
        for an_exp in replicate_exps:
            # track if all experiments completed step0
            ready_for_step1 = True
            exp_id = an_exp['replicate_exp']['accession']
            exp_resp = [i for i in all_items['experiment_atacseq'] if i['accession'] == exp_id][0]
            # exp_files [[pair1,pair2], [pair1, pair2]]
            exp_files, paired = wfr_utils.get_chip_files(exp_resp, all_files)
            # if there are more then 2 files, we need to merge:
            print(exp_id, len(exp_files), paired)
            # if too many input, merge them
            if len(exp_files) > 2:
                # exp_files format [[pair1,pair2], [pair1, pair2]]  @id
                input_list = []
                if paired == 'paired':
                    # first add paired end 1s
                    input_list.append([i[0] for i in exp_files])
                    # then paired end 2s
                    input_list.append([i[1] for i in exp_files])
                elif paired == 'single':
                    input_list.append([i[0] for i in exp_files])
                # collect files for step1 and step1c
                merged_files = []
                step0_status = 'complete'
                merge_enum = 0
                # if paired, need to run merge twice for each end
                for merge_case in input_list:
                    merge_enum += 1
                    # RUN STEP 0
                    s0_input_files = {'input_fastqs': merge_case}
                    # tag distinguishes the merge run of each read end (_p1, _p2)
                    s0_tag = exp_id + '_p' + str(merge_enum)
                    keep, step0_status, step0_output = wfr_utils.stepper(library, keep,
                                                                         'step0', s0_tag, merge_case,
                                                                         s0_input_files, step0_name, 'merged_fastq')
                    if step0_status == 'complete':
                        merged_files.append(step0_output)
                    else:
                        ready_for_step1 = False

                if ready_for_step1:
                    # rewrite exp_files with merged ones
                    exp_files = [[]]
                    for a_merged in merged_files:
                        exp_files[0].append(a_merged)
            # if step0 was not complete, skip checks for step2
            if not ready_for_step1:
                ready_for_step2 = False
                continue

            # step1 files
            # references (organism is guaranteed to be human or mouse by the
            # sanity check above, so org is always assigned)
            input_files = {}
            if organism == 'human':
                org = 'hs'
                input_files['atac.bowtie2_idx_tar'] = '/files-reference/4DNFIMQPTYDY/'
                input_files['atac.blacklist'] = '/files-reference/4DNFIZ1TGJZR/'
                input_files['atac.chrsz'] = '/files-reference/4DNFIZJB62D1/'
                input_files['additional_file_parameters'] = {"atac.bowtie2_idx_tar": {"rename": "GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.tar"}}
            if organism == 'mouse':
                org = 'mm'
                input_files['atac.bowtie2_idx_tar'] = '/files-reference/4DNFI2493SDN/'
                input_files['atac.blacklist'] = '/files-reference/4DNFIZ3FBPK8/'
                input_files['atac.chrsz'] = '/files-reference/4DNFIBP173GC/'
                input_files['additional_file_parameters'] = {"atac.bowtie2_idx_tar": {"rename": "mm10_no_alt_analysis_set_ENCODE.fasta.tar"}}
            # add input files
            input_files['atac.fastqs'] = [exp_files]
            # step1 Parameters
            parameters = {
                "atac.pipeline_type": 'atac',
                "atac.gensz": org,
                "atac.bam2ta.regex_grep_v_ta": "chr[MUE]|random|alt",
                "atac.disable_ataqc": True,
                "atac.enable_xcor": False,
                "atac.trim_adapter.auto_detect_adapter": True,
                "atac.bowtie2.cpu": 4,
                "atac.filter.cpu": 4,
                "atac.bam2ta.cpu": 4,
                "atac.trim_adapter.cpu": 4,
                "atac.align_only": True
            }
            if paired == 'single':
                # single-end runs get a fixed fragment length of 300 per input entry
                frag_temp = [300]
                fraglist = frag_temp * len(exp_files)
                parameters['atac.fraglen'] = fraglist
                parameters['atac.paired_end'] = False
            elif paired == 'paired':
                parameters['atac.paired_end'] = True

            s1_input_files = input_files
            s1_tag = exp_id
            # NOTE(review): the original comment said step1_output is a list of
            # 2 files (first_ta and first_ta_xcor), but only 'atac.first_ta' is
            # requested here -- confirm the shape stepper returns
            keep, step1_status, step1_output = wfr_utils.stepper(library, keep,
                                                                 'step1', s1_tag, exp_files,
                                                                 s1_input_files, step1_name, 'atac.first_ta',
                                                                 additional_input={'parameters': parameters})
            if step1_status == 'complete':
                # accumulate files to patch on experiment
                patch_data = [step1_output, ]
                complete['patch_opf'].append([exp_id, patch_data])
                ta.append(step1_output)
            else:
                # don't patch anything if at least one exp is still missing
                ready_for_step2 = False
            print('step1', step1_status, step1_output)

        # back to set level
        final_status = set_acc  # start the reporting with acc
        all_completed = False
        # is step0 step1 complete
        if ready_for_step2:
            # Following was the proposed logic, but it is not implemented
            # Currently, for sets with more than 2 experiments, there are 2 options
            # 1) pick best 2,   2) run mergebed (default)

            # Proposed logic
            # if there are more then 2 experiments, check the number of biological replicates
            # if there is 1 Biological Replicate
            # -pick best 2 exp
            # if there are 2 Biological replicates
            #  - run mergebed on bioreps with more then 1 technical replicate
            # if there are 3 Biological replicates
            # - if there are 3 total experiments (1 in each biological rep), pick best 2
            # - else, run mergebed on bioreps with more then 1 technical replicate, and pick best 2 biorep
            # if there are 4 or more Biological replicates
            # - run mergebed on bioreps with more then 1 technical replicate, and pick best 2 biorep
            # this only works with 2 experiments, if 3, pick best 2, if more, skip for now
            ready_for_step3 = True
            if len(ta) > 2:
                if pick_best_2:
                    # pick best 2 - False by default
                    # (the printed message says 3, but this branch runs for any count above 2)
                    print('ExperimentSet has 3 experiments, selecting best 2')
                    ta = wfr_utils.select_best_2(ta, all_files, all_qcs)
                else:
                    # run mergebed - default option
                    s2_input_files = {'input_bed': ta}
                    s2_tag = set_acc
                    # if complete, step2_output ('merged_bed') replaces the
                    # individual tagAlign files as the only step3 input
                    keep, step2_status, step2_output = wfr_utils.stepper(library, keep,
                                                                         'step2', s2_tag, ta,
                                                                         s2_input_files, step2_name, 'merged_bed')
                    if step2_status == 'complete':
                        ta = [step2_output, ]
                    else:
                        ready_for_step3 = False
            if ready_for_step3:
                # collect step3 input files
                s3_input_files = {}
                if organism == 'human':
                    org = 'hs'
                    s3_input_files['atac.blacklist'] = '/files-reference/4DNFIZ1TGJZR/'
                    s3_input_files['atac.chrsz'] = '/files-reference/4DNFIZJB62D1/'
                if organism == 'mouse':
                    org = 'mm'
                    s3_input_files['atac.blacklist'] = '/files-reference/4DNFIZ3FBPK8/'
                    s3_input_files['atac.chrsz'] = '/files-reference/4DNFIBP173GC/'

                def rename_chip(input_at_id_list):
                    # rename bed.gz to tagAlign.gz
                    # takes the third '/'-separated piece of each entry as the
                    # accession (assumes entries look like '/files-x/ACCESSION/'
                    # -- TODO confirm)
                    renamed = []
                    for a_file in input_at_id_list:
                        acc = a_file.split('/')[2]
                        renamed.append(acc + '.tagAlign.gz')
                    return renamed

                s3_input_files['additional_file_parameters'] = {}
                s3_input_files['atac.tas'] = ta
                # NOTE(review): the rename is registered under 'chip.tas' while
                # the input argument above is 'atac.tas' -- looks like a
                # leftover from the ChIP-seq check; confirm which key is expected
                s3_input_files['additional_file_parameters']['chip.tas'] = {"rename": rename_chip(ta)}
                # collect parameters
                # `paired` holds the value from the last experiment processed above
                # NOTE(review): chip_p stays unassigned if paired is neither
                # 'single' nor 'paired' (e.g. still "") -- confirm that cannot happen
                if paired == 'single':
                    chip_p = False
                elif paired == 'paired':
                    chip_p = True
                parameters = {
                    "atac.pipeline_type": 'atac',
                    "atac.paired_end": chip_p,
                    "atac.gensz": org,
                    "atac.disable_ataqc": True,
                    "atac.enable_xcor": False,
                }
                if paired == 'single':
                    frag_temp = [300]
                    fraglist = frag_temp * len(ta)
                    parameters['atac.fraglen'] = fraglist

                s3_tag = set_acc
                # if complete, step3_output is indexed below in the order the
                # outputs are requested: optimal peak, conservative peak, signal fc
                keep, step3_status, step3_output = wfr_utils.stepper(library, keep,
                                                                     'step3', s3_tag, ta,
                                                                     s3_input_files, step3_name,
                                                                     ['atac.optimal_peak', 'atac.conservative_peak', 'atac.sig_fc'],
                                                                     additional_input={'parameters': parameters})
                if step3_status == 'complete':
                    set_opt_peak = step3_output[0]
                    set_cons_peak = step3_output[1]
                    set_sig_fc = step3_output[2]
                    # accumulate files to patch on the experiment set
                    patch_data = [set_opt_peak, set_cons_peak, set_sig_fc]
                    complete['patch_opf'].append([set_acc, patch_data])
                    complete['add_tag'] = [set_acc, tag]
                    all_completed = True

        # unpack results
        missing_run = keep['missing_run']
        running = keep['running']
        problematic_run = keep['problematic_run']
        if all_completed:
            final_status += ' completed'
        else:
            if missing_run:
                final_status += ' |Missing: ' + " ".join([i[0] for i in missing_run])
            if running:
                final_status += ' |Running: ' + " ".join([i[0] for i in running])
            if problematic_run:
                final_status += ' |Problem: ' + " ".join([i[0] for i in problematic_run])

        # add dictionaries to main ones
        check.brief_output.append(final_status)
        print(final_status)
        if running:
            check.full_output['running_runs'].append({set_acc: running})
        if missing_run:
            check.full_output['needs_runs'].append({set_acc: missing_run})
        if problematic_run:
            check.full_output['problematic_runs'].append({set_acc: problematic_run})
        # if made it till the end
        # (add_tag is only filled once step3 completed, so no run may be
        # pending, running or problematic at this point)
        if complete.get('add_tag'):
            assert not running
            assert not problematic_run
            assert not missing_run
            check.full_output['completed_runs'].append(complete)

    # complete check values
    # summary is a '|'-separated list of counts; every category except
    # 'running' turns the status to WARN, and only 'missing' and 'completed'
    # enable the action
    check.summary = ""
    if check.full_output['running_runs']:
        check.summary = str(len(check.full_output['running_runs'])) + ' running|'
    if check.full_output['skipped']:
        check.summary += str(len(check.full_output['skipped'])) + ' skipped|'
        check.status = 'WARN'
    if check.full_output['needs_runs']:
        check.summary += str(len(check.full_output['needs_runs'])) + ' missing|'
        check.status = 'WARN'
        check.allow_action = True
    if check.full_output['completed_runs']:
        check.summary += str(len(check.full_output['completed_runs'])) + ' completed|'
        check.status = 'WARN'
        check.allow_action = True
    if check.full_output['problematic_runs']:
        check.summary += str(len(check.full_output['problematic_runs'])) + ' problem|'
        check.status = 'WARN'
    return check

예제 #34

0

파일 보기

파일: badge_checks.py 프로젝트: 4dn-dcic/foursight

def repsets_have_bio_reps(connection, **kwargs):
    '''
    Audit replicate experiment sets for replicate-numbering problems:
    1) Only a single biological replicate (includes sets with single experiment)
    2) Biological replicate numbers that are not in sequence
    3) Technical replicate numbers that are not in sequence

    Sets whose status is in REV are reported in the brief output only; for all
    other sets the findings are compared with the existing 'replicate-numbers'
    badges, and the action patches badges with a message per issue found.
    '''
    check = CheckResult(connection, 'repsets_have_bio_reps')

    exp_sets = ff_utils.search_metadata('search/?type=ExperimentSetReplicate&frame=object',
                                        key=connection.ff_keys, page_limit=50)

    issue_kinds = ('single_biorep', 'biorep_nums', 'techrep_nums')
    audits = {
        REV_KEY: {kind: [] for kind in issue_kinds},
        RELEASED_KEY: {kind: [] for kind in issue_kinds},
    }
    by_exp = {}
    for exp_set in exp_sets:
        # group technical replicate numbers under their biological replicate number
        tech_by_bio = {}
        for rep in exp_set.get('replicate_exps') or []:
            tech_by_bio.setdefault(rep['bio_rep_no'], []).append(rep['tec_rep_no'])
        if not tech_by_bio:
            # nothing to audit without replicate information
            continue

        in_review = exp_set.get('status') in REV
        bucket = audits[REV_KEY if in_review else RELEASED_KEY]
        found = []
        bio_nums = sorted(tech_by_bio)

        # a single biological replicate
        if len(bio_nums) == 1:
            # this tag labels an ExpSet with many replicates, but only one
            # present in the database (typically imaging datasets)
            if 'many_replicates' in exp_set.get('tags', []):  # skip false positive
                continue
            bucket['single_biorep'].append(exp_set['@id'])
            found.append('Replicate set contains only a single biological replicate')

        # biological replicate numbers must form an unbroken integer range
        if bio_nums != list(range(bio_nums[0], bio_nums[-1] + 1)):
            bucket['biorep_nums'].append(
                '{} - bio rep #s: {}'.format(exp_set['@id'], str(bio_nums))
            )
            found.append('Biological replicate numbers are not in sequence')

        # same requirement for the technical replicates of each biological replicate
        for bio_no, tech_nums in tech_by_bio.items():
            ordered = sorted(tech_nums)
            if ordered != list(range(ordered[0], ordered[-1] + 1)):
                bucket['techrep_nums'].append(
                    '{} - tech rep #s of biorep {}: {}'.format(exp_set['@id'], bio_no, str(ordered))
                )
                found.append(
                    'Technical replicate numbers of biological replicate {} are not in sequence'.format(bio_no)
                )

        # only sets outside of review take part in badge patching
        if found and not in_review:
            by_exp[exp_set['@id']] = sorted(found)

    to_add, to_remove, to_edit, ok = compare_badges_and_messages(
        by_exp, 'ExperimentSetReplicate', 'replicate-numbers', connection.ff_keys
    )
    check.action = 'patch_badges_for_replicate_numbers'
    if to_add or to_remove or to_edit:
        check.status = 'WARN'
        check.summary = 'Replicate number badges need patching'
        check.description = '{} replicate experiment sets need replicate badges patched'.format(
            len(to_add.values()) + len(to_remove.values()) + len(to_edit.values())
        )
        check.allow_action = True
    else:
        check.status = 'PASS'
        check.summary = 'Replicate number badges up-to-date'
        check.description = 'No replicate number badges need patching'
    check.full_output = {
        'Add badge': to_add,
        'Remove badge': to_remove,
        'Keep badge and edit messages': to_edit,
        'Keep badge (no change)': len(ok),
    }
    check.brief_output = {REV_KEY: audits[REV_KEY]}
    check.brief_output[RELEASED_KEY] = {
        outcome: {kind: [] for kind in issue_kinds} for outcome in check.full_output.keys()
    }
    # distribute the released-set findings over the badge outcomes; unchanged
    # badges are only counted, not listed
    change_outcomes = ('Add badge', 'Remove badge', 'Keep badge and edit messages')
    for kind, entries in audits[RELEASED_KEY].items():
        unchanged = 0
        for entry in entries:
            # each entry starts with the set's @id, followed by a space when more text follows
            set_id = entry.split(' ')[0]
            for outcome in change_outcomes:
                if set_id in check.full_output[outcome].keys():
                    check.brief_output[RELEASED_KEY][outcome][kind].append(entry)
            if set_id in ok:
                unchanged += 1
        check.brief_output[RELEASED_KEY]['Keep badge (no change)'][kind] = unchanged
    return check