Example #1
import json
import os
from base64 import b64encode

import requests

# Assumed to come from the surrounding package (not shown in this example):
# get_parser, Connection, Analysis, COMMON_METADATA, ASSEMBLIES, QC_MAP.

def main():
    """Program
    """
    parser = get_parser()
    args = parser.parse_args()
    mode = args.dcc_mode
    exp_id = args.exp_id

    conn = Connection(mode)
    exp_rep_dico = conn.get_fastqfile_replicate_hash(exp_id)
    exp_json = conn.get(exp_id, ignore404=True)
    controls = exp_json["possible_controls"]  # A list of dicts.

    # Populate a controls lookup hash. The keys are the control accessions; each value is a dict
    # whose "rep_dico" key holds that control's replicates hash (the return value of
    # conn.get_fastqfile_replicate_hash()).
    controls_hash = {}  # A dict of dicts.
    control_bio_rep_counts = []
    for c in controls:
        ctl_accession = c["accession"]
        controls_hash[ctl_accession] = {}
        ctl_rep_dico = conn.get_fastqfile_replicate_hash(ctl_accession)
        controls_hash[ctl_accession]["rep_dico"] = ctl_rep_dico
        control_bio_rep_counts.append(len(ctl_rep_dico))

    # Make sure that all control experiments have the same number of biological replicates. There are
    # no known rules to apply otherwise.
    if len(set(control_bio_rep_counts)) != 1:
        raise Exception(
            "The controls '{controls}' have different numbers of biological replicates from one another '{rep_nums}'."
            .format(controls=list(controls_hash), rep_nums=control_bio_rep_counts))

    # Make sure that the number of control bio reps equals the number of experiment bio reps:
    exp_bio_rep_count = len(exp_rep_dico)
    if exp_bio_rep_count != control_bio_rep_counts[0]:
        raise Exception(
            "The number of experiment replicates '{}' doesn't equal the number of control replicates '{}'."
            .format(exp_bio_rep_count, control_bio_rep_counts[0]))

    # Now we'll look at each bio rep on the experiment, in numerical order of
    # biological_replicate_number from least to greatest. We'll work our way all the way down to
    # the FASTQ files and start populating the File.controlled_by property in the following manner:
    #
    #  For each control, we'll sort the replicates the same way as we did for the ones on the
    #  experiment, then for the replicate having the same ordinal index, we'll add the FASTQ File
    #  references.

    sorted_exp_bio_reps = sorted(exp_rep_dico)
    # And now for the nastiest for-loop I've ever written ... this should be cleaned up, but the
    # logic is so rough to implement that it'll be ugly any way we look at it.
    for count, b in enumerate(sorted_exp_bio_reps):  # b is the biological_replicate_number.
        for t in exp_rep_dico[b]:  # technical_replicate_number
            for read_num in exp_rep_dico[b][t]:
                for fastq_json in exp_rep_dico[b][t][read_num]:
                    exp_file_acc = fastq_json["accession"]
                    controlled_by = []
                    for c in controls_hash:
                        ctl_bio_rep_num = sorted(
                            controls_hash[c]["rep_dico"])[count]
                        ctl_tech_reps = controls_hash[c]["rep_dico"][
                            ctl_bio_rep_num]
                        for ctl_tech_rep_num in ctl_tech_reps:
                            for ctl_encff in ctl_tech_reps[ctl_tech_rep_num][
                                    read_num]:
                                controlled_by.append(ctl_encff["accession"])
                    conn.patch(
                        {
                            conn.ENCID_KEY: exp_file_acc,
                            "controlled_by": controlled_by
                        },
                        extend_array_values=False)
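

# For illustration only: a minimal, runnable sketch (hypothetical accessions,
# not real data) of the ordinal pairing performed above. Experiment and
# control biological replicates are each sorted by biological_replicate_number,
# and the control replicate at the same sorted index serves as the control.
def _pairing_sketch():
    exp_reps = {1: "ENCFF001AAA", 2: "ENCFF002AAA"}  # bio rep -> FASTQ accession
    ctl_reps = {1: "ENCFF001CCC", 2: "ENCFF002CCC"}
    for count, bio_rep_num in enumerate(sorted(exp_reps)):
        ctl_bio_rep_num = sorted(ctl_reps)[count]
        # Rep 1's FASTQ is controlled_by the control's rep-1 FASTQ, and so on.
        print(exp_reps[bio_rep_num], "controlled_by", ctl_reps[ctl_bio_rep_num])
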
class Accession(object):
    """Accessions pipeline output files and their QC metrics to the ENCODE portal."""

    def __init__(self, steps, metadata_json, server, lab, award):
        super().__init__()
        self.set_lab_award(lab, award)
        self.analysis = Analysis(metadata_json)
        self.steps_and_params_json = self.file_to_json(steps)
        self.backend = self.analysis.backend
        self.conn = Connection(server)
        self.new_files = []
        self.current_user = self.get_current_user()

    def set_lab_award(self, lab, award):
        global COMMON_METADATA
        COMMON_METADATA['lab'] = lab
        COMMON_METADATA['award'] = award

    def get_current_user(self):
        response = requests.get(self.conn.dcc_url + '/session-properties',
                                auth=self.conn.auth)
        if response.ok:
            user = response.json().get('user')
            if user:
                return user.get('@id')
            raise Exception('Authenticated user not found')
        else:
            raise Exception('Request to portal failed')

    def file_to_json(self, file):
        with open(file) as json_file:
            json_obj = json.load(json_file)
        return json_obj

    def accession_fastqs(self):
        pass

    def wait_for_portal(self):
        pass

    def file_at_portal(self, file):
        self.wait_for_portal()
        md5sum = self.backend.md5sum(file)
        search_param = [('md5sum', md5sum), ('type', 'File')]
        encode_file = self.conn.search(search_param)
        if encode_file:
            return self.conn.get(encode_file[0].get('accession'))

    def raw_fastq_inputs(self, file):
        if not file.task and 'fastqs' in file.filekeys:
            yield file
        if file.task:
            for input_file in file.task.input_files:
                yield from self.raw_fastq_inputs(input_file)

    def raw_files_accessioned(self):
        for file in self.analysis.raw_fastqs:
            if not self.file_at_portal(file.filename):
                return False
        return True

    def accession_file(self, encode_file, gs_file):
        file_exists = self.file_at_portal(gs_file.filename)
        submitted_file_path = {'submitted_file_name': gs_file.filename}
        if not file_exists:
            local_file = self.backend.download(gs_file.filename)[0]
            encode_file['submitted_file_name'] = local_file
            encode_posted_file = self.conn.post(encode_file)
            os.remove(local_file)
            encode_posted_file = self.patch_file(encode_posted_file,
                                                 submitted_file_path)
            self.new_files.append(encode_posted_file)
            return encode_posted_file
        # file_exists is necessarily truthy here, so only check its status.
        elif file_exists.get('status') in ['deleted', 'revoked']:
            encode_file.update(submitted_file_path)
            # Update the file to current user
            # TODO: Reverse this when duplicate md5sums are enabled
            encode_file.update({'submitted_by': self.current_user})
            encode_patched_file = self.patch_file(file_exists, encode_file)
            self.new_files.append(encode_patched_file)
            return encode_patched_file
        return file_exists

    def patch_file(self, encode_file, new_properties):
        new_properties[self.conn.ENCID_KEY] = encode_file.get('accession')
        return self.conn.patch(new_properties, extend_array_values=False)

    def get_or_make_step_run(self, lab_prefix, run_name, step_version,
                             task_name):
        docker_tag = self.analysis.get_tasks(task_name)[0].docker_image.split(
            ':')[1]
        payload = {
            'aliases': ["{}:{}-{}".format(lab_prefix, run_name, docker_tag)],
            'status': 'released',
            'analysis_step_version': step_version
        }
        payload[Connection.PROFILE_KEY] = 'analysis_step_runs'
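        # For orientation, the full payload now looks roughly like this
        # (hypothetical placeholder values):
        #   {'aliases': ['encode-lab:atac-run-1-v1.1.4'],
        #    'status': 'released',
        #    'analysis_step_version': '/analysis-step-versions/...',
        #    Connection.PROFILE_KEY: 'analysis_step_runs'}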
        print(payload)
        return self.conn.post(payload)

    @property
    def assembly(self):
        assembly = [
            reference for reference in ASSEMBLIES
            if reference in self.analysis.get_tasks('read_genome_tsv')
            [0].outputs.get('genome', {}).get('ref_fa', '')
        ]
        return assembly[0] if assembly else ''

    @property
    def lab_pi(self):
        return COMMON_METADATA['lab'].split('/labs/')[1].split('/')[0]

    @property
    def dataset(self):
        return self.file_at_portal(
            self.analysis.raw_fastqs[0].filename).get('dataset')

    def file_from_template(self,
                           file,
                           file_format,
                           output_type,
                           step_run,
                           derived_from,
                           dataset,
                           file_format_type=None):
        file_name = file.filename.split('gs://')[-1].replace('/', '-')
        obj = {
            'status': 'uploading',
            'aliases': ['{}:{}'.format(self.lab_pi, file_name)],
            'file_format': file_format,
            'output_type': output_type,
            'assembly': self.assembly,
            'dataset': dataset,
            'step_run': step_run.get('@id'),
            'derived_from': derived_from,
            'file_size': file.size,
            'md5sum': file.md5sum
        }
        if file_format_type:
            obj['file_format_type'] = file_format_type
        obj[Connection.PROFILE_KEY] = 'file'
        obj.update(COMMON_METADATA)
        return obj

    def get_derived_from_all(self, file, files, inputs=False):
        ancestors = []
        for ancestor in files:
            ancestors.append(
                self.get_derived_from(file, ancestor.get('derived_from_task'),
                                      ancestor.get('derived_from_filekey'),
                                      ancestor.get('derived_from_output_type'),
                                      ancestor.get('derived_from_inputs')))
        return list(self.flatten(ancestors))

    def flatten(self, nested_list):
        if isinstance(nested_list, str):
            yield nested_list
        if isinstance(nested_list, list):
            for item in nested_list:
                yield from self.flatten(item)
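
    # A quick illustration of flatten() with hypothetical values:
    #
    #   >>> list(self.flatten(['a', ['b', ['c']], 'd']))
    #   ['a', 'b', 'c', 'd']
    #
    # Strings yield themselves and lists are walked recursively, so arbitrarily
    # nested lists of accession ids collapse into one flat sequence.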

    # Returns a list of accession ids of files that are on the portal or were
    # recently accessioned.
    def get_derived_from(self,
                         file,
                         task_name,
                         filekey,
                         output_type=None,
                         inputs=False):
        derived_from_files = list(
            set(
                list(
                    self.analysis.search_up(file.task, task_name, filekey,
                                            inputs))))
        encode_files = [
            self.file_at_portal(gs_file.filename)
            for gs_file in derived_from_files
        ]
        accessioned_files = encode_files + self.new_files
        accessioned_files = [x for x in accessioned_files if x is not None]
        derived_from_accession_ids = []
        for gs_file in derived_from_files:
            for encode_file in accessioned_files:
                if gs_file.md5sum == encode_file.get('md5sum'):
                    # Optimal peaks can be mistaken for conservative peaks
                    # when their md5sum is the same
                    if output_type and output_type != encode_file.get(
                            'output_type'):
                        continue
                    derived_from_accession_ids.append(
                        encode_file.get('accession'))
        derived_from_accession_ids = list(set(derived_from_accession_ids))

        # Raise exception when some or all of the derived_from files
        # are missing from the portal
        if not derived_from_accession_ids:
            raise Exception(
                'Missing all of the derived_from files on the portal')
        if len(derived_from_accession_ids) != len(derived_from_files):
            raise Exception(
                'Missing some of the derived_from files on the portal')
        return [
            '/files/{}/'.format(accession_id)
            for accession_id in derived_from_accession_ids
        ]
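
    # Hypothetical example of the output_type disambiguation above: when an
    # optimal and a conservative IDR peaks file share an md5sum, passing
    # output_type='optimal IDR thresholded peaks' keeps only the portal file
    # whose output_type matches, so the md5sum twin is skipped.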

    # Builds the file object to be accessioned.
    # inputs=True will search for input fastqs in derived_from.

    def make_file_obj(self,
                      file,
                      file_format,
                      output_type,
                      step_run,
                      derived_from_files,
                      file_format_type=None,
                      inputs=False):
        derived_from = self.get_derived_from_all(file, derived_from_files,
                                                 inputs)
        return self.file_from_template(file, file_format, output_type,
                                       step_run, derived_from, self.dataset,
                                       file_format_type)

    def get_bio_replicate(self, encode_file, string=True):
        replicate = encode_file.get('biological_replicates')[0]
        if string:
            return str(replicate)
        return int(replicate)

    def attach_idr_qc_to(self, encode_file, gs_file):
        # Return early if the QC metric already exists.
        if any('IDRQualityMetric' in x['@type']
               for x in encode_file['quality_metrics']):
            return
        qc = self.backend.read_json(self.analysis.get_files('qc_json')[0])
        idr_qc = qc['idr_frip_qc']
        replicate = self.get_bio_replicate(encode_file)
        rep_pr = idr_qc['rep' + replicate + '-pr']
        frip_score = rep_pr['FRiP']
        idr_peaks = qc['ataqc']['rep' + replicate]['IDR peaks'][0]
        step_run = encode_file.get('step_run')
        if isinstance(step_run, str):
            step_run_id = step_run
        elif isinstance(step_run, dict):
            step_run_id = step_run.get('@id')
        qc_object = {}
        qc_object['F1'] = frip_score
        qc_object['N1'] = idr_peaks
        idr_cutoff = self.analysis.metadata['inputs']['atac.idr_thresh']
        # Strongly assumes the plot exists; next() raises StopIteration if not.
        plot_png = next(
            self.analysis.search_up(gs_file.task, 'idr_pr', 'idr_plot'))
        qc_object.update({
            'step_run': step_run_id,
            'quality_metric_of': [encode_file.get('@id')],
            'IDR_cutoff': idr_cutoff,
            'status': 'released',
            'IDR_plot_rep{}_pr'.format(replicate):
                self.get_attachment(plot_png, 'image/png')
        })
        qc_object.update(COMMON_METADATA)
        qc_object[Connection.PROFILE_KEY] = 'idr-quality-metrics'
        posted_qc = self.conn.post(qc_object, require_aliases=False)
        return posted_qc

    def attach_flagstat_qc_to(self, encode_bam_file, gs_file):
        # Return early if the QC metric already exists.
        if any('SamtoolsFlagstatsQualityMetric' in x['@type']
               for x in encode_bam_file['quality_metrics']):
            return
        qc = self.backend.read_json(self.analysis.get_files('qc_json')[0])
        replicate = self.get_bio_replicate(encode_bam_file)
        flagstat_qc = qc['nodup_flagstat_qc']['rep' + replicate]
        for key, value in flagstat_qc.items():
            if '_pct' in key:
                flagstat_qc[key] = '{}%'.format(value)
        step_run = encode_bam_file.get('step_run')
        if isinstance(step_run, str):
            step_run_id = step_run
        elif isinstance(step_run, dict):
            step_run_id = step_run.get('@id')
        flagstat_qc.update({
            'step_run': step_run_id,
            'quality_metric_of': [encode_bam_file.get('@id')],
            'status': 'released'
        })
        flagstat_qc.update(COMMON_METADATA)
        flagstat_qc[Connection.PROFILE_KEY] = 'samtools-flagstats-quality-metric'
        posted_qc = self.conn.post(flagstat_qc, require_aliases=False)
        return posted_qc

    def attach_cross_correlation_qc_to(self, encode_bam_file, gs_file):
        # Return early if the QC metric already exists.
        if any('ComplexityXcorrQualityMetric' in x['@type']
               for x in encode_bam_file['quality_metrics']):
            return

        qc = self.backend.read_json(self.analysis.get_files('qc_json')[0])
        plot_pdf = next(
            self.analysis.search_down(gs_file.task, 'xcor', 'plot_pdf'))
        read_length_file = next(
            self.analysis.search_up(gs_file.task, 'bowtie2', 'read_len_log'))
        read_length = int(
            self.backend.read_file(read_length_file.filename).decode())
        replicate = self.get_bio_replicate(encode_bam_file)
        xcor_qc = qc['xcor_score']['rep' + replicate]
        pbc_qc = qc['pbc_qc']['rep' + replicate]
        step_run = encode_bam_file.get('step_run')
        if isinstance(step_run, str):
            step_run_id = step_run
        elif isinstance(step_run, dict):
            step_run_id = step_run.get('@id')

        xcor_object = {
            'NRF': pbc_qc['NRF'],
            'PBC1': pbc_qc['PBC1'],
            'PBC2': pbc_qc['PBC2'],
            'NSC': xcor_qc['NSC'],
            'RSC': xcor_qc['RSC'],
            'sample size': xcor_qc['num_reads'],
            'fragment length': xcor_qc['est_frag_len'],
            'quality_metric_of': [encode_bam_file.get('@id')],
            'step_run': step_run_id,
            'paired-end': self.analysis.metadata['inputs']['atac.paired_end'],
            'read length': read_length,
            'status': 'released',
            'cross_correlation_plot':
                self.get_attachment(plot_pdf, 'application/pdf')
        }

        xcor_object.update(COMMON_METADATA)
        xcor_object[Connection.PROFILE_KEY] = 'complexity-xcorr-quality-metrics'
        posted_qc = self.conn.post(xcor_object, require_aliases=False)
        return posted_qc

    def file_has_qc(self, bam, qc):
        for item in bam['quality_metrics']:
            if item['@type'][0] == qc['@type'][0]:
                return True
        return False

    def get_attachment(self, gs_file, mime_type):
        contents = self.backend.read_file(gs_file.filename)
        # b64encode() returns bytes; the portal expects the base64 payload as
        # a plain string, so decode it rather than stripping the b'' repr.
        contents = b64encode(contents).decode('ascii')
        obj = {
            'type': mime_type,
            'download': gs_file.filename.split('/')[-1],
            'href': 'data:{};base64,{}'.format(mime_type, contents)
        }
        return obj
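
    # The returned attachment embeds the file contents as a data URI. With
    # hypothetical contents it looks like:
    #
    #   {'type': 'image/png',
    #    'download': 'rep1_idr_plot.png',
    #    'href': 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUg...'}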

    def accession_step(self, single_step_params):
        step_run = self.get_or_make_step_run(
            self.lab_pi, single_step_params['dcc_step_run'],
            single_step_params['dcc_step_version'],
            single_step_params['wdl_task_name'])
        accessioned_files = []
        for task in self.analysis.get_tasks(
                single_step_params['wdl_task_name']):
            for file_params in single_step_params['wdl_files']:
                for wdl_file in [
                        file for file in task.output_files
                        if file_params['filekey'] in file.filekeys
                ]:

                    # Conservative IDR thresholded peaks may have the same
                    # md5sum as the optimal ones.
                    try:
                        obj = self.make_file_obj(
                            wdl_file,
                            file_params['file_format'],
                            file_params['output_type'],
                            step_run,
                            file_params['derived_from_files'],
                            file_format_type=file_params.get(
                                'file_format_type'))
                        encode_file = self.accession_file(obj, wdl_file)
                    except Exception as e:
                        if 'Conflict' in str(e) and file_params.get(
                                'possible_duplicate'):
                            continue
                        elif 'Missing all of the derived_from' in str(e):
                            continue
                        else:
                            raise

                    # The input parameter file assumes that Accession
                    # implements the methods needed to attach the quality
                    # metrics.
                    quality_metrics = file_params.get('quality_metrics', [])
                    for qc in quality_metrics:
                        qc_method = getattr(self, QC_MAP[qc])
                        # Re-fetch the file so the QC method sees its
                        # calculated properties.
                        qc_method(self.conn.get(encode_file.get('accession')),
                                  wdl_file)
                    accessioned_files.append(encode_file)
        return accessioned_files

    def accession_steps(self):
        for step in self.steps_and_params_json:
            self.accession_step(step)
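

# A hypothetical driver (every value below is a placeholder, not a real lab,
# award, or file) showing the intended usage: build an Accession from a
# steps/params JSON and a pipeline metadata JSON, then accession each step
# in order.
#
#   accessioner = Accession(steps='steps_params.json',
#                           metadata_json='metadata.json',
#                           server='dev',
#                           lab='/labs/example-lab/',
#                           award='U00HG000000')
#   accessioner.accession_steps()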