Example #1
0
def run_json(input_files, env, wf_info, run_name):
    """Create the trigger json that is used by the Fourfront endpoint.

    Args:
        input_files: list of formatted input-file dicts (see extract_file_info).
        env: fourfront environment name, used to resolve s3 buckets.
        wf_info: workflow info dict; must contain 'wf_uuid', 'wf_name',
            'wfr_meta' and 'parameters'; may contain 'config',
            'custom_pf_fields' and 'overwrite_input_extra' overrides.
        run_name: identifier recorded as the tibanna run_id.

    Returns:
        dict ready to be posted as a WorkflowRun/run trigger.
    """
    my_s3_util = s3Utils(env=env)
    out_bucket = my_s3_util.outfile_bucket
    input_json = {
        'input_files': input_files,
        'output_bucket': out_bucket,
        'workflow_uuid': wf_info['wf_uuid'],
        "app_name": wf_info['wf_name'],
        "wfr_meta": wf_info['wfr_meta'],
        "parameters": wf_info['parameters'],
        # default tibanna runtime configuration; overridable via wf_info['config']
        "config": {
            "ebs_type": "gp2",
            "spot_instance": True,
            "json_bucket": "4dn-aws-pipeline-run-json",
            "ebs_iops": "",
            "shutdown_min": "now",
            "copy_to_s3": True,
            "launch_instance": True,
            "password": "",
            "log_bucket": "tibanna-output",
            "key_name": "4dn-encode"
        },
        "_tibanna": {
            "env": env,
            "run_type": wf_info['wf_name'],
            "run_id": run_name
        }
    }
    # overwrite or add custom fields supplied by the caller
    for a_key in ['config', 'custom_pf_fields', 'overwrite_input_extra']:
        if a_key in wf_info:
            input_json[a_key] = wf_info[a_key]
    return input_json
Example #2
0
def extract_file_info(obj_id, arg_name, env, rename=None):
    """Create the formatted input-file dictionary for a workflow run.

    Args:
        obj_id: a file identifier, or a list of identifiers that must all
            live in the same s3 bucket.
        arg_name: workflow argument name recorded in the template.
        env: fourfront environment name, used for auth and bucket lookup.
        rename: optional two-element sequence [change_from, change_to];
            when given, a 'rename' entry is added with the substitution
            applied to the object key(s).

    Returns:
        dict with 'workflow_argument_name', 'object_key', 'uuid',
        'bucket_name' and optionally 'rename'; None when a list of files
        spans multiple buckets (an error is printed).
    """
    auth = ff_utils.get_authentication_with_server({}, ff_env=env)
    my_s3_util = s3Utils(env=env)

    raw_bucket = my_s3_util.raw_file_bucket
    out_bucket = my_s3_util.outfile_bucket
    # start a dictionary
    template = {"workflow_argument_name": arg_name}
    if rename:
        change_from = rename[0]
        change_to = rename[1]
    # if it is list of items, change the structure
    if isinstance(obj_id, list):
        object_key = []
        uuid = []
        buckets = []
        for obj in obj_id:
            metadata = ff_utils.get_metadata(obj, key=auth)
            object_key.append(metadata['display_title'])
            uuid.append(metadata['uuid'])
            # processed files live in the out bucket; everything else
            # (FileFastq, FileReference, FileMicroscopy) in the raw bucket
            if 'FileProcessed' in metadata['@type']:
                my_bucket = out_bucket
            else:
                my_bucket = raw_bucket
            buckets.append(my_bucket)
        # all files of one argument must come from a single bucket
        if len(set(buckets)) != 1:
            print('Files from different buckets', obj_id)
            return
        template['object_key'] = object_key
        template['uuid'] = uuid
        template['bucket_name'] = buckets[0]
        if rename:
            template['rename'] = [
                i.replace(change_from, change_to)
                for i in template['object_key']
            ]

    # if obj_id is a string
    else:
        metadata = ff_utils.get_metadata(obj_id, key=auth)
        template['object_key'] = metadata['display_title']
        template['uuid'] = metadata['uuid']
        # get the bucket (same rule as the list branch above)
        if 'FileProcessed' in metadata['@type']:
            my_bucket = out_bucket
        else:  # covers cases of FileFastq, FileReference, FileMicroscopy
            my_bucket = raw_bucket
        template['bucket_name'] = my_bucket
        if rename:
            template['rename'] = template['object_key'].replace(
                change_from, change_to)
    return template
Example #3
0
def run_missing_atac1(wf_info, organism, paired, files, obj_keys, my_env,
                      my_key, run_name):
    """Create and post the trigger json for ATAC-seq step 1 (align only).

    Builds the input file list (organism reference files plus the
    experiment fastqs), the pipeline parameters, and posts a
    WorkflowRun/run trigger to the Fourfront endpoint, displaying an HTML
    link to the resulting tibanna run.

    NOTE(review): assumes organism is 'human' or 'mouse' and paired is
    'single' or 'paired'; other values fail with UnboundLocalError further
    down -- confirm callers guarantee this.
    """
    my_s3_util = s3Utils(env=my_env)
    raw_bucket = my_s3_util.raw_file_bucket
    out_bucket = my_s3_util.outfile_bucket

    # organism-specific reference files: bowtie2 index, blacklist, chrom sizes
    if organism == "human":
        org = 'hs'
        input_files = [{
            "object_key": "4DNFIMQPTYDY.bowtie2Index.tar",
            "rename": "GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.tar",
            "bucket_name": raw_bucket,
            "workflow_argument_name": "atac.bowtie2_idx_tar",
            "uuid": "28ab6265-f426-4a23-bb8a-f28467ad505b"
        }, {
            "object_key": "4DNFIZ1TGJZR.bed.gz",
            "bucket_name": raw_bucket,
            "workflow_argument_name": "atac.blacklist",
            "uuid": "9562ffbd-9f7a-4bd7-9c10-c335137d8966"
        }, {
            "object_key": "4DNFIZJB62D1.chrom.sizes",
            "bucket_name": raw_bucket,
            "workflow_argument_name": "atac.chrsz",
            "uuid": "9866d158-da3c-4d9b-96a9-1d59632eabeb"
        }]

    elif organism == "mouse":
        org = 'mm'
        input_files = [{
            "object_key": "4DNFI2493SDN.bowtie2Index.tar",
            "rename": "mm10_no_alt_analysis_set_ENCODE.fasta.tar",
            "bucket_name": raw_bucket,
            "workflow_argument_name": "atac.bowtie2_idx_tar",
            "uuid": "63e22058-79c6-4e24-8231-ca4afac29dda"
        }, {
            "object_key": "4DNFIZ3FBPK8.bed.gz",
            "bucket_name": raw_bucket,
            "workflow_argument_name": "atac.blacklist",
            "uuid": "a32747a3-8a9e-4a9e-a7a1-4db0e8b65925"
        }, {
            "object_key": "4DNFIBP173GC.chrom.sizes",
            "bucket_name": raw_bucket,
            "workflow_argument_name": "atac.chrsz",
            "uuid": "be0a9819-d2ce-4422-be4b-234fb1677dd9"
        }]

    # the experiment fastqs themselves
    input_files.append({
        "object_key": obj_keys,
        "bucket_name": raw_bucket,
        "workflow_argument_name": "atac.fastqs",
        "uuid": files
    })

    if paired == 'single':
        chip_p = False
    elif paired == 'paired':
        chip_p = True
    parameters = {
        "atac.pipeline_type": 'atac',
        "atac.paired_end": chip_p,
        "atac.gensz": org,
        "atac.bam2ta.regex_grep_v_ta": "chr[MUE]|random|alt",
        "atac.disable_ataqc": True,
        "atac.enable_xcor": False,
        "atac.trim_adapter.auto_detect_adapter": True,
        "atac.bowtie2.cpu": 4,
        "atac.filter.cpu": 4,
        "atac.bam2ta.cpu": 4,
        "atac.trim_adapter.cpu": 4,
        "atac.align_only": True
    }

    # single-end runs need an explicit fragment length estimate per file
    if paired == 'single':
        frag_temp = [300]
        fraglist = frag_temp * len(files)
        parameters['atac.fraglen'] = fraglist

    tag = '1.1.1'
    # trigger json used by the Fourfront endpoint
    input_json = {
        'input_files': input_files,
        'output_bucket': out_bucket,
        'workflow_uuid': wf_info['wf_uuid'],
        "app_name": wf_info['wf_name'],
        "wfr_meta": wf_info['wfr_meta'],
        "parameters": parameters,
        "config": wf_info['config'],
        "custom_pf_fields": wf_info['custom_pf_fields'],
        "_tibanna": {
            "env": my_env,
            "run_type": wf_info['wf_name'],
            "run_id": run_name
        },
        "tag": tag
    }
    e = ff_utils.post_metadata(input_json, 'WorkflowRun/run', key=my_key)
    url = json.loads(e['input'])['_tibanna']['url']
    display(
        HTML("<a href='{}' target='_blank'>{}</a>".format(url, e['status'])))
Example #4
0
def run_missing_chip2(control_set, wf_info, organism, target_type, paired, ta,
                      ta_xcor, ta_cnt, my_env, my_key, run_ids):
    """Create and post the trigger json for ChIP-seq step 2 (peak calling).

    Assembles organism reference files plus the tagAlign inputs (and
    control tagAligns when control_set is truthy), then posts a
    WorkflowRun/run trigger to the Fourfront endpoint and displays an
    HTML link to the resulting tibanna run.

    Returns early (None) for histone targets without a control set, which
    need manual attention.

    NOTE(review): assumes organism is 'human' or 'mouse' and paired is
    'single' or 'paired'; other values fail with UnboundLocalError further
    down -- confirm callers guarantee this.
    """
    my_s3_util = s3Utils(env=my_env)
    raw_bucket = my_s3_util.raw_file_bucket
    out_bucket = my_s3_util.outfile_bucket

    # organism-specific reference files: blacklist and chrom sizes
    if organism == "human":
        org = 'hs'
        input_files = [{
            "object_key": "4DNFIZ1TGJZR.bed.gz",
            "bucket_name": raw_bucket,
            "workflow_argument_name": "chip.blacklist",
            "uuid": "9562ffbd-9f7a-4bd7-9c10-c335137d8966"
        }, {
            "object_key": "4DNFIZJB62D1.chrom.sizes",
            "bucket_name": raw_bucket,
            "workflow_argument_name": "chip.chrsz",
            "uuid": "9866d158-da3c-4d9b-96a9-1d59632eabeb"
        }]

    elif organism == "mouse":
        org = 'mm'
        input_files = [{
            "object_key": "4DNFIZ3FBPK8.bed.gz",
            "bucket_name": raw_bucket,
            "workflow_argument_name": "chip.blacklist",
            "uuid": "a32747a3-8a9e-4a9e-a7a1-4db0e8b65925"
        }, {
            "object_key": "4DNFIBP173GC.chrom.sizes",
            "bucket_name": raw_bucket,
            "workflow_argument_name": "chip.chrsz",
            "uuid": "be0a9819-d2ce-4422-be4b-234fb1677dd9"
        }]

    # tagAlign inputs; .bed keys are renamed to .tagAlign for the pipeline
    ta_f = extract_file_info(ta,
                             'chip.tas',
                             my_env,
                             rename=['bed', 'tagAlign'])
    input_files.append(ta_f)
    ta_xcor_f = extract_file_info(ta_xcor,
                                  'chip.bam2ta_no_filt_R1.ta',
                                  my_env,
                                  rename=['bed', 'tagAlign'])
    input_files.append(ta_xcor_f)
    if control_set:
        ta_cnt = extract_file_info(ta_cnt,
                                   'chip.ctl_tas',
                                   my_env,
                                   rename=['bed', 'tagAlign'])
        input_files.append(ta_cnt)

    if paired == 'single':
        chip_p = False
    elif paired == 'paired':
        chip_p = True
    if not control_set:
        if target_type == 'histone':
            print(
                'HISTONE WITHOUT CONTROL NEEDS ATTENTION (change to tf), skipping for now'
            )
            return

    parameters = {
        "chip.pipeline_type": target_type,
        "chip.paired_end": chip_p,
        "chip.choose_ctl.always_use_pooled_ctl": True,
        "chip.qc_report.name": run_ids['run_name'],
        "chip.qc_report.desc": run_ids['desc'],
        "chip.gensz": org,
        "chip.xcor.cpu": 4,
        "chip.spp_cpu": 4
    }

    # single-end runs need an explicit fragment length estimate per file
    if paired == 'single':
        frag_temp = [300]
        fraglist = frag_temp * len(ta)
        parameters['chip.fraglen'] = fraglist

    tag = '1.1.1'
    # trigger json used by the Fourfront endpoint
    input_json = {
        'input_files': input_files,
        'output_bucket': out_bucket,
        'workflow_uuid': wf_info['wf_uuid'],
        "app_name": wf_info['wf_name'],
        "wfr_meta": wf_info['wfr_meta'],
        "parameters": parameters,
        "config": wf_info['config'],
        "custom_pf_fields": wf_info['custom_pf_fields'],
        "_tibanna": {
            "env": my_env,
            "run_type": wf_info['wf_name'],
            "run_id": run_ids['run_name']
        },
        "tag": tag
    }
    e = ff_utils.post_metadata(input_json, 'WorkflowRun/run', key=my_key)
    url = json.loads(e['input'])['_tibanna']['url']
    display(
        HTML("<a href='{}' target='_blank'>{}</a>".format(url, e['status'])))
Example #5
0
def run_missing_chip1(control, wf_info, organism, target_type, paired, files,
                      obj_keys, my_env, my_key, run_name):
    """Create and post the trigger json for ChIP-seq step 1 (align only).

    Builds the input file list (organism reference files plus the fastqs,
    routed to chip.ctl_fastqs when `control` is truthy, chip.fastqs
    otherwise), the pipeline parameters, and posts a WorkflowRun/run
    trigger to the Fourfront endpoint, displaying an HTML link to the
    resulting tibanna run.

    NOTE(review): assumes organism is 'human' or 'mouse' and paired is
    'single' or 'paired'; other values fail with UnboundLocalError further
    down -- confirm callers guarantee this.
    """
    my_s3_util = s3Utils(env=my_env)
    raw_bucket = my_s3_util.raw_file_bucket
    out_bucket = my_s3_util.outfile_bucket

    # organism-specific reference files: bwa index, blacklist, chrom sizes
    if organism == "human":
        org = 'hs'
        input_files = [{
            "object_key": "4DNFIZQB369V.bwaIndex.tar",
            "rename": "GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.tar",
            "bucket_name": raw_bucket,
            "workflow_argument_name": "chip.bwa_idx_tar",
            "uuid": "38077b98-3862-45cd-b4be-8e28e9494549"
        }, {
            "object_key": "4DNFIZ1TGJZR.bed.gz",
            "bucket_name": raw_bucket,
            "workflow_argument_name": "chip.blacklist",
            "uuid": "9562ffbd-9f7a-4bd7-9c10-c335137d8966"
        }, {
            "object_key": "4DNFIZJB62D1.chrom.sizes",
            "bucket_name": raw_bucket,
            "workflow_argument_name": "chip.chrsz",
            "uuid": "9866d158-da3c-4d9b-96a9-1d59632eabeb"
        }]

    elif organism == "mouse":
        org = 'mm'
        input_files = [{
            "object_key": "4DNFIZ2PWCC2.bwaIndex.tar",
            "rename": "mm10_no_alt_analysis_set_ENCODE.fasta.tar",
            "bucket_name": raw_bucket,
            "workflow_argument_name": "chip.bwa_idx_tar",
            "uuid": "f4b63d31-65d8-437f-a76a-6bedbb52ae6f"
        }, {
            "object_key": "4DNFIZ3FBPK8.bed.gz",
            "bucket_name": raw_bucket,
            "workflow_argument_name": "chip.blacklist",
            "uuid": "a32747a3-8a9e-4a9e-a7a1-4db0e8b65925"
        }, {
            "object_key": "4DNFIBP173GC.chrom.sizes",
            "bucket_name": raw_bucket,
            "workflow_argument_name": "chip.chrsz",
            "uuid": "be0a9819-d2ce-4422-be4b-234fb1677dd9"
        }]
    # fastqs go to the control or experiment argument depending on `control`
    if control:
        input_files.append({
            "object_key": obj_keys,
            "bucket_name": raw_bucket,
            "workflow_argument_name": "chip.ctl_fastqs",
            "uuid": files
        })
    else:
        input_files.append({
            "object_key": obj_keys,
            "bucket_name": raw_bucket,
            "workflow_argument_name": "chip.fastqs",
            "uuid": files
        })

    if paired == 'single':
        chip_p = False
    elif paired == 'paired':
        chip_p = True
    # control runs use the *_ctl task parameters and skip xcor
    if control:
        parameters = {
            "chip.pipeline_type": target_type,
            "chip.paired_end": chip_p,
            "chip.choose_ctl.always_use_pooled_ctl": True,
            "chip.gensz": org,
            "chip.bam2ta_ctl.regex_grep_v_ta": "chr[MUE]|random|alt",
            "chip.bwa_ctl.cpu": 8,
            "chip.merge_fastq_ctl.cpu": 8,
            "chip.filter_ctl.cpu": 8,
            "chip.bam2ta_ctl.cpu": 8,
            "chip.align_only": True
        }
    else:
        parameters = {
            "chip.pipeline_type": target_type,
            "chip.paired_end": chip_p,
            "chip.choose_ctl.always_use_pooled_ctl": True,
            "chip.gensz": org,
            "chip.bam2ta.regex_grep_v_ta": "chr[MUE]|random|alt",
            "chip.bwa.cpu": 8,
            "chip.merge_fastq.cpu": 8,
            "chip.filter.cpu": 8,
            "chip.bam2ta.cpu": 8,
            "chip.xcor.cpu": 8,
            "chip.align_only": True
        }
    # single-end runs need an explicit fragment length estimate per file
    if paired == 'single':
        frag_temp = [300]
        fraglist = frag_temp * len(files)
        parameters['chip.fraglen'] = fraglist

    tag = '1.1.1'
    # trigger json used by the Fourfront endpoint
    input_json = {
        'input_files': input_files,
        'output_bucket': out_bucket,
        'workflow_uuid': wf_info['wf_uuid'],
        "app_name": wf_info['wf_name'],
        "wfr_meta": wf_info['wfr_meta'],
        "parameters": parameters,
        "config": wf_info['config'],
        "custom_pf_fields": wf_info['custom_pf_fields'],
        "_tibanna": {
            "env": my_env,
            "run_type": wf_info['wf_name'],
            "run_id": run_name
        },
        "tag": tag
    }
    e = ff_utils.post_metadata(input_json, 'WorkflowRun/run', key=my_key)
    url = json.loads(e['input'])['_tibanna']['url']
    display(
        HTML("<a href='{}' target='_blank'>{}</a>".format(url, e['status'])))
Example #6
0
def find_pairs(my_rep_set, my_env, lookfor='pairs', exclude_miseq=True):
    """Find fastq files from an experiment set, excluding miseq by default.

    Walks every experiment in the replicate set, skips deleted / missing /
    paired-end-2 / (optionally) MiSeq files, verifies each file exists in
    s3, and groups files per experiment accession: single accessions when
    lookfor == 'single', (pair1, pair2) tuples otherwise.

    Returns:
        tuple of (report dict, organism, enzyme, bwa index, chrom sizes,
        enzyme file, total file size in GB, [lab @id]); or None on
        single/paired mismatch or a missing pair.
    """
    auth = ff_utils.get_authentication_with_server({}, ff_env=my_env)
    my_s3_util = s3Utils(env=my_env)
    report = {}
    rep_resp = my_rep_set['experiments_in_set']
    lab = [my_rep_set['lab']['@id']]
    enzymes = []
    organisms = []
    total_f_size = 0
    for exp in rep_resp:

        exp_resp = exp

        report[exp['accession']] = []
        # organism is resolved once, from the first experiment's biosample
        if not organisms:
            biosample = exp['biosample']
            organisms = list(
                set([
                    bs['individual']['organism']['name']
                    for bs in biosample['biosource']
                ]))
            if len(organisms) != 1:
                print('multiple organisms in set', my_rep_set['accession'])
                break
        exp_files = exp['files']
        enzyme = exp.get('digestion_enzyme')
        if enzyme:
            enzymes.append(enzyme['display_title'])

        for fastq_file in exp_files:
            file_resp = ff_utils.get_metadata(fastq_file['uuid'], key=auth)
            if not file_resp.get('file_size'):
                print("WARNING!", file_resp['accession'],
                      'does not have filesize')
            else:
                total_f_size += file_resp['file_size']
            # skip pair no 2 (it is picked up via its pair-1 partner)
            if file_resp.get('paired_end') == '2':
                continue
            # exclude miseq
            if exclude_miseq:
                if file_resp.get('instrument') == 'Illumina MiSeq':
                    continue
            # Some checks before running
            # check if status is deleted
            if file_resp['status'] == 'deleted':
                print('deleted file', file_resp['accession'], 'in',
                      my_rep_set['accession'])
                continue
            # if no uploaded file in the file item report and skip
            if not file_resp.get('filename'):
                print(file_resp['accession'], "does not have a file")
                continue
            # check if file is in s3

            head_info = my_s3_util.does_key_exist(file_resp['upload_key'],
                                                  my_s3_util.raw_file_bucket)

            if not head_info:
                print(file_resp['accession'], "does not have a file in S3")
                continue
            # check that file has a pair
            f1 = file_resp['@id']
            f2 = ""
            paired = ""
            # is there exactly one 'paired with' relation?
            # (bare except + assert replaced with an explicit check)
            paired_files = [
                relation['file']['@id']
                for relation in file_resp.get('related_files', [])
                if relation['relationship_type'] == 'paired with'
            ]
            if len(paired_files) == 1:
                f2 = paired_files[0]
                paired = "Yes"
            else:
                paired = "No"

            # for experiments with unpaired fastq files
            if lookfor == 'single':
                if paired == 'No':
                    report[exp_resp['accession']].append(f1)
                else:
                    print('expected single files, found paired end')
                    return
            # for experiments with paired files
            else:
                if paired != 'Yes':
                    print('expected paired files, found single end')
                    return
                f2 = ''
                relations = file_resp.get('related_files')

                if not relations:
                    print(f1, 'does not have a pair')
                    return
                for relation in relations:
                    if relation['relationship_type'] == 'paired with':
                        f2 = relation['file']['@id']
                if not f2:
                    print(f1, 'does not have a pair')
                    return

                report[exp_resp['accession']].append((f1, f2))
    # get the organism
    if len(list(set(organisms))) == 1:
        organism = organisms[0]
    else:
        organism = None

    # get the enzyme
    if len(list(set(enzymes))) == 1:
        enz = enzymes[0]
    else:
        enz = None

    # organism-keyed reference lookups come from module-level dicts
    bwa = bwa_index.get(organism)
    chrsize = chr_size.get(organism)
    if re_nz.get(organism):
        enz_file = re_nz[organism].get(enz)
    else:
        print('no enzyme information for the organism {}'.format(organism))
        enz_file = None

    return report, organism, enz, bwa, chrsize, enz_file, int(
        total_f_size / (1024 * 1024 * 1024)), lab
Example #7
0
def run_missing_atac2(wf_info, organism, paired, ta, my_env, my_key, run_name):
    """Create and post the trigger json for ATAC-seq step 2 (peak calling).

    Assembles organism reference files plus the tagAlign inputs, the
    pipeline parameters, and posts a WorkflowRun/run trigger to the
    Fourfront endpoint, displaying an HTML link to the resulting tibanna
    run.

    NOTE(review): assumes organism is 'human' or 'mouse' and paired is
    'single' or 'paired'; other values fail with UnboundLocalError further
    down -- confirm callers guarantee this.
    """
    my_s3_util = s3Utils(env=my_env)
    raw_bucket = my_s3_util.raw_file_bucket
    out_bucket = my_s3_util.outfile_bucket

    # organism-specific reference files: blacklist and chrom sizes
    if organism == "human":
        org = 'hs'
        input_files = [{
            "object_key": "4DNFIZ1TGJZR.bed.gz",
            "bucket_name": raw_bucket,
            "workflow_argument_name": "atac.blacklist",
            "uuid": "9562ffbd-9f7a-4bd7-9c10-c335137d8966"
        }, {
            "object_key": "4DNFIZJB62D1.chrom.sizes",
            "bucket_name": raw_bucket,
            "workflow_argument_name": "atac.chrsz",
            "uuid": "9866d158-da3c-4d9b-96a9-1d59632eabeb"
        }]

    elif organism == "mouse":
        org = 'mm'
        input_files = [{
            "object_key": "4DNFIZ3FBPK8.bed.gz",
            "bucket_name": raw_bucket,
            "workflow_argument_name": "atac.blacklist",
            "uuid": "a32747a3-8a9e-4a9e-a7a1-4db0e8b65925"
        }, {
            "object_key": "4DNFIBP173GC.chrom.sizes",
            "bucket_name": raw_bucket,
            "workflow_argument_name": "atac.chrsz",
            "uuid": "be0a9819-d2ce-4422-be4b-234fb1677dd9"
        }]

    # tagAlign inputs; .bed keys are renamed to .tagAlign for the pipeline
    ta_f = extract_file_info(ta,
                             'atac.tas',
                             my_env,
                             rename=['bed', 'tagAlign'])
    input_files.append(ta_f)

    if paired == 'single':
        chip_p = False
    elif paired == 'paired':
        chip_p = True

    parameters = {
        "atac.pipeline_type": 'atac',
        "atac.paired_end": chip_p,
        "atac.gensz": org,
        "atac.disable_ataqc": True,
        "atac.enable_xcor": False,
    }

    # single-end runs need an explicit fragment length estimate per file
    if paired == 'single':
        frag_temp = [300]
        fraglist = frag_temp * len(ta)
        parameters['atac.fraglen'] = fraglist

    tag = '1.1.1'
    # trigger json used by the Fourfront endpoint
    input_json = {
        'input_files': input_files,
        'output_bucket': out_bucket,
        'workflow_uuid': wf_info['wf_uuid'],
        "app_name": wf_info['wf_name'],
        "wfr_meta": wf_info['wfr_meta'],
        "parameters": parameters,
        "config": wf_info['config'],
        "custom_pf_fields": wf_info['custom_pf_fields'],
        "_tibanna": {
            "env": my_env,
            "run_type": wf_info['wf_name'],
            "run_id": run_name
        },
        "tag": tag
    }
    e = ff_utils.post_metadata(input_json, 'WorkflowRun/run', key=my_key)
    url = json.loads(e['input'])['_tibanna']['url']
    display(
        HTML("<a href='{}' target='_blank'>{}</a>".format(url, e['status'])))