Example #1
def main():
    """Program
    """
    parser = get_parser()
    args = parser.parse_args()
    infile = args.infile
    outfile = args.outfile
    dcc_mode = args.dcc_mode

    conn = Connection(dcc_mode=dcc_mode)

    fh = open(infile, 'r')
    fout = open(outfile, 'w')
    for line in fh:
        rec = line.strip("\n").split("\t")[0]
        if not rec or rec.startswith("#"):
            fout.write(line)
            continue
        rec = conn.get(rec_ids=rec, ignore404=False)
        aliases = rec["aliases"]
        # Append all of the record's aliases to the input line as extra tab-separated fields.
        outline = [line.strip("\n")]
        outline.extend(aliases)
        fout.write("\t".join(outline) + "\n")
    fout.close()
    fh.close()
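All of these command-line examples rely on a get_parser() helper that isn't shown. A minimal sketch of what it presumably looks like, reconstructed only from the argument names the scripts read (infile, outfile, dcc_mode); the real helper likely differs in flags, help text, and defaults:

import argparse

def get_parser():
    # Hypothetical reconstruction; only the arguments the scripts actually read are included.
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True,
                        help="Tab-delimited input file with a record identifier in the first column.")
    parser.add_argument("-o", "--outfile", required=True, help="Path of the output file to write.")
    parser.add_argument("-m", "--dcc-mode", dest="dcc_mode",
                        help="The ENCODE Portal host to connect to, e.g. 'dev' or 'prod'.")
    return parser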
Example #2
def test_dry_run_enabled(self):
    """
    Tests the method ``check_dry_run`` for returning True when the ``Connection`` class is
    instantiated in dry-run mode.
    """
    self.conn = Connection(eu.DCC_DEV_MODE, dry_run=True, no_log_file=True)
    self.assertEqual(True, self.conn.check_dry_run())
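Example #3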
def __init__(self, steps, metadata_json, server, lab, award):
    super(Accession, self).__init__()
    self.set_lab_award(lab, award)
    self.analysis = Analysis(metadata_json)
    self.steps_and_params_json = self.file_to_json(steps)
    self.backend = self.analysis.backend
    self.conn = Connection(server)
    self.new_files = []
    self.current_user = self.get_current_user()
Example #4
def main():
    """Program
    """
    EXP_PROFILE_ID = "experiment"
    FILE_PROFILE_ID = "file"
    VALID_PROFILES = [EXP_PROFILE_ID, FILE_PROFILE_ID]
    parser = get_parser()
    args = parser.parse_args()
    infile = args.infile
    outfile = args.outfile
    dcc_mode = args.dcc_mode

    conn = Connection(dcc_mode)

    fh = open(infile, 'r')
    fout = open(outfile, 'w')
    for line in fh:
        rec_id = line.strip("\n").split("\t")[0]
        if not rec_id or rec_id.startswith("#"):
            continue
        rec = conn.get(rec_id, ignore404=False)
        profile = conn.profiles.get_profile_from_id(rec["@id"])
        profile_id = profile.name
        if profile_id not in VALID_PROFILES:
            raise Exception(
                "Record identifier '{}' must be an identifer for an object of a type in the set {}."
                .format(rec_id, VALID_PROFILES))

        if profile_id == EXP_PROFILE_ID:
            # List of FASTQ file objects in JSON format.
            fastq_recs = conn.get_fastqfiles_on_exp(rec_id)
            exp_accession = rec["accession"]
        else:
            fastq_recs = [rec]  # The record was already fetched above.
            # The dataset property is an @id such as "/experiments/ENCSR502NRF/".
            exp_accession = rec["dataset"].rstrip("/").split("/")[-1]
        for fq_rec in fastq_recs:
            status = fq_rec["status"]
            error_msg = ""
            if status == "content error":
                error_msg = fq_rec["content_error_detail"]
                fout.write("\t".join([exp_accession, rec_id, error_msg]) +
                           "\n")
    fout.close()
    fh.close()
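Each row this script writes is tab-separated: the experiment accession, the input record identifier, and the file's content_error_detail message. A hypothetical example row (the file accession and message are invented):

ENCSR502NRF	ENCFF000AAA	missing quality score values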
Example #5
def main():
    """Program
    """
    parser = get_parser()
    args = parser.parse_args()
    infile = args.infile
    outfile = args.outfile
    dcc_mode = args.dcc_mode
    submitter_lab = args.submitter_lab
    if not submitter_lab:
        submitter_lab = encode_utils.LAB_PREFIX.rstrip(":")

    conn = Connection(dcc_mode=dcc_mode)

    fh = open(infile, 'r')
    fout = open(outfile, 'w')
    for line in fh:
        alias = line.strip("\n").split("\t")[0]
        if not alias or alias.startswith("#"):
            fout.write(line)
            continue
        try:
            lab_prefix, alias_name = alias.split(":", 1)
        except ValueError:
            if not submitter_lab:
                raise Exception(
                    "Unknown submitting lab name for alias {}. See description for --submitter-lab  argument."
                    .format(alias))
            alias = submitter_lab + ":" + alias
        rec = conn.get(rec_ids=alias, ignore404=False)
        try:
            dcc_id = rec["accession"]
        except KeyError:
            dcc_id = rec["uuid"]
        outline = [line.strip("\n")]
        outline.append(dcc_id)  # Append the DCC identifier as an extra tab-separated field.
        fout.write("\t".join(outline) + "\n")
    fout.close()
    fh.close()
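The try/except above works because str.split with maxsplit=1 returns a single-element list when the delimiter is absent, so unpacking into two names raises ValueError:

>>> "michael-snyder:my-alias".split(":", 1)
['michael-snyder', 'my-alias']
>>> "my-alias".split(":", 1)  # no colon: one element, so two-name unpacking fails
['my-alias']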
Example #6
def main():
    """Program
    """
    parser = get_parser()
    args = parser.parse_args()
    infile = args.infile
    outfile = args.outfile
    dcc_mode = args.dcc_mode

    conn = Connection(dcc_mode)

    fh = open(infile, 'r')
    fout = open(outfile, 'w')
    for line in fh:
        rec_id = line.strip()
        if not rec_id or rec_id.startswith("#"):
            continue
        rec = conn.get(rec_id, ignore404=True)
        if not rec:
            print("'{}' not found.".format(rec_id))
            fout.write(rec_id + "\n")
    fout.close()
    fh.close()
Example #7
def main():
    """Program
    """
    parser = get_parser()
    args = parser.parse_args()
    mode = args.dcc_mode
    exp_id = args.exp_id
    bio_rep_num = args.bio_rep_num
    tech_rep_num = args.tech_rep_num

    conn = Connection(mode)
    rep_dico = conn.get_fastqfile_replicate_hash(exp_id)

    for b in rep_dico:
        if bio_rep_num and b != bio_rep_num:
            continue
        for t in rep_dico[b]:
            if tech_rep_num and t != tech_rep_num:
                continue
            for read_num in rep_dico[b][t]:
                for fastq_json in rep_dico[b][t][read_num]:
                    alias = fastq_json["aliases"][0]
                    print("_".join([str(b), str(t),
                                    str(read_num)]) + "\t" + alias)
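For reference, get_fastqfile_replicate_hash() appears to return a dict keyed by biological_replicate_number, then technical_replicate_number, then read number, with lists of FASTQ file records at the leaves. A sketch of the assumed shape (accessions and aliases invented):

rep_dico = {
    1: {                 # biological_replicate_number
        1: {             # technical_replicate_number
            1: [{"accession": "ENCFF000AAA", "aliases": ["lab:fastq_R1"]}],  # read 1
            2: [{"accession": "ENCFF000BBB", "aliases": ["lab:fastq_R2"]}],  # read 2
        },
    },
}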
Example #8
def test_arbitrary_host(self):
    self.conn = Connection(dcc_mode='test.encodedcc.org', no_log_file=True)
Example #9
def setUp(self):
    self.conn = Connection(eu.DCC_DEV_MODE, no_log_file=True)
Example #10
class TestConnection(unittest.TestCase):
    """Tests the ``encode_utils.connection.py`` module.
    """
    def setUp(self):
        self.conn = Connection(eu.DCC_DEV_MODE, no_log_file=True)

    def test_arbitrary_host(self):
        self.conn = Connection(dcc_mode='test.encodedcc.org', no_log_file=True)

    def test_before_file_post(self):
        """
        Tests the method ``before_file_post()`` for correctly setting the `md5sum` property of a
        file record.
        """
        payload = {
            self.conn.PROFILE_KEY:
            profiles.Profiles.FILE_PROFILE_ID,
            profiles.Profiles.SUBMITTED_FILE_PROP_NAME:
            os.path.join(DATA_DIR, "test_fq_40recs.fastq.gz")
        }
        res = self.conn.before_post_file(payload)
        self.assertEqual(res["md5sum"], "a3e7cb3df359d0642ab0edd33ea7e93e")

    def test_get_lookup_ids_from_payload(self):
        """
        Tests the method ``get_lookup_ids_from_payload()`` for returning the correct result when
        given a variety of identifiers (accession, alias, and md5sum).
        """
        accession = "ENCSR502NRF"
        alias = "michael-snyder:SCGPM_SReq-1103_HG7CL_L3_GGCTAC_R1.fastq.gz"
        md5 = "3fef3e25315f105b944691668838b9b5"
        payload = {
            self.conn.ENCID_KEY: accession,
            "aliases": [alias],
            "md5sum": md5
        }

        res = self.conn.get_lookup_ids_from_payload(payload)
        self.assertEqual(sorted(res), sorted([accession, alias, md5]))

    def test_get_profile_from_payload(self):
        """
        Tests the method ``get_profile_from_payload()`` for returning the correct result when only the
        key ``encode_utils.connection.Connection.PROFILE_KEY`` is set in the payload.
        """
        # Use a valid profile ID that exists as a key in profiles.Profile.PROFILES.
        profile_id = "genetic_modification"
        payload = {}
        payload[self.conn.PROFILE_KEY] = profile_id
        res = self.conn.get_profile_from_payload(payload)
        self.assertEqual(res.name, profile_id)

    def test_2_get_profile_from_payload(self):
        """
        Tests the method ``get_profile_from_payload()`` for returning the correct result when only the
        key for the `@id` property is set in the payload.
        """
        # Use a valid profile ID that exists as a key in profiles.Profile.PROFILES.
        profile_id = "genetic_modification"
        payload = {}
        payload["@id"] = "genetic_modification"
        res = self.conn.get_profile_from_payload(payload)
        self.assertEqual(res.name, profile_id)

    def test_3_get_profile_from_payload(self):
        """
        Tests the method ``get_profile_from_payload()`` for raising the exception
        ``encode_utils.exceptions.ProfileNotSpecified`` when neither the ``self.PROFILE_KEY`` nor the `@id`
        key is present in the payload.
        """
        # Don't specify a profile in the payload at all.
        payload = {}
        self.assertRaises(ProfileNotSpecified,
                          self.conn.get_profile_from_payload, payload)

    def test_4_get_profile_from_payload(self):
        """
        Tests the method ``get_profile_from_payload()`` for raising the exception
        ``profiles.UnknownProfile`` when an unknown profile is specified in the payload.
        """
        # Use a profile ID that doesn't exist as a key in profiles.Profile.PROFILES.
        payload = {}
        payload[self.conn.PROFILE_KEY] = "unknown_profile"
        self.assertRaises(profiles.UnknownProfile,
                          self.conn.get_profile_from_payload, payload)

    def test_extract_aws_upload_credentials(self):
        """
        Tests the method ``extract_aws_upload_credentials()`` for extracting the upload
        credentials from a file object's JSON.
        """
        access_key = "access_key"
        secret_key = "secret_key"
        session_token = "session_token"
        upload_url = "upload_url"

        payload = {
            access_key: access_key,
            secret_key: secret_key,
            session_token: session_token,
            upload_url: upload_url
        }

        res = self.conn.extract_aws_upload_credentials(payload)

        aws_creds = {}
        aws_creds["AWS_ACCESS_KEY_ID"] = access_key
        aws_creds["AWS_SECRET_ACCESS_KEY"] = secret_key
        aws_creds["AWS_SESSION_TOKEN"] = session_token
        aws_creds["UPLOAD_URL"] = upload_url

        self.assertEqual(res, aws_creds)

    def test_make_search_url(self):
        """
        Tests the method ``make_search_url()`` for building the correct URL given the query arguments
        to find ChIP-seq assays performed on primary cells from blood.
        """
        query = {
            "assay_title": "ChIP-seq",
            "biosample_type": "primary cell",
            "organ_slims": "blood",
            "type": "Experiment"
        }

        res = self.conn.make_search_url(search_args=query)
        query = "search/?assay_title=ChIP-seq&biosample_type=primary+cell&organ_slims=blood&type=Experiment"
        self.assertEqual(res, os.path.join(self.conn.dcc_mode.url, query))

    def test_get(self):
        res = self.conn.get('experiments/ENCSR502NRF/', frame='object')
        self.assertEqual(res.get('uuid', ""),
                         "e44c59cc-f14a-4722-a9c5-2fe63c2b9533")

    def test_dry_run_enabled(self):
        """
        Tests the method ``check_dry_run`` for returning True when the ``Connection`` class is
        instantiated in dry-run mode.
        """
        self.conn = Connection(eu.DCC_DEV_MODE, dry_run=True, no_log_file=True)
        self.assertEqual(True, self.conn.check_dry_run())

    def test_bedfile_download(self):
        """
        Tests the method ``download`` for downloading a small BED file record (ENCFF815QOR,
        about 44 KB) into the current directory.
        """
        filepath = self.conn.download(rec_id="ENCFF815QOR",
                                      directory=os.getcwd())
        self.assertTrue(os.stat(filepath).st_size > 0)

    def test_doc_download(self):
        """
        Tests the method ``download`` for downloading a document record (michael-snyder:P-17)
        into the current directory.
        """
        filepath = self.conn.download(rec_id="michael-snyder:P-17",
                                      directory=os.getcwd())
        self.assertTrue(os.stat(filepath).st_size > 0)

    def test_autosql_attachment(self):
        """
        Tests the method ``set_attachment`` for autosql attachment.
        """
        encoded_uri = self.conn.set_attachment(
            os.path.join(DATA_DIR, "estarr_counts.as"))
        print(encoded_uri)
        self.assertTrue(encoded_uri['href'] == (
            'data:text/autosql;base64,'
            'dGFibGUgZXN0YXJyX2NvdW50cwoiZVNUQVJSIGNvdW50cyIKKApzdHJpbmcgS'
            'UQ7ICJDYW5kaWRhdGUgaWRlbnRpZmllciIKc3RyaW5nIERpcmVjdGlvbjsgIk'
            'Nsb25pbmcgZGlyZWN0aW9uIgp1aW50IFVNSV9jb3VudDsgIlVuaXF1ZSBNb2x'
            'lY3VsYXIgSWRlbnRpZmllciBjb3VudCIKKQ=='))
Example #11
def test_connection_dcc_mode_https_url(mocker):
    mocker.patch("requests.get")
    conn = Connection("https://www.foo.bar", no_log_file=True)
    assert conn.dcc_mode.url == "https://www.foo.bar"
Example #12
def main():
    """Program
    """
    parser = get_parser()
    args = parser.parse_args()
    dcc_mode = args.dcc_mode
    infile = args.infile
    protocol_uuid = args.protocol_uuid

    # connect to DCC
    conn = Connection(dcc_mode)

    barplot_description = "Barplot showing the expression of the given gene in the control vs. the treatment. Expression is given in Transcripts Per Million (TPM) and was generated by version 1.2.30 of RSEM's rsem-calculate-expression script."
    fh = open(infile, 'r')
    header = fh.readline().strip("\n")
    if not header.startswith("#"):
        raise Exception(
            "First line of input file must be a field-header line starting with a '#'."
        )
    # key: library accession; value: {"barplot": local_barplot_path, "line": line_from_input_file}
    dico = {}
    # store a list of all exp IDs seen in input file so we can later link the
    # analysis protocol doc to the exp.
    exp_encids = []
    for line in fh:
        line = line.strip("\n")
        if not line.strip():
            continue
        line = line.split("\t")
        dcc_exp_id = line[0].strip()
        if dcc_exp_id not in exp_encids:
            exp_encids.append(dcc_exp_id)
        dcc_rep_id = line[1].strip()
        rep_json = conn.get(dcc_rep_id, ignore404=False)
        dcc_lib_id = rep_json["library"]["accession"]
        barplot = line[2].strip()
        dico[dcc_lib_id] = {"barplot": barplot, "line": line}
    fh.close()

    fout = open(OUTPUT_FILE, 'w')
    fout.write(header + "\tjpeg_dcc_uuid\n")
    for lib_id in dico:
        barplot = dico[lib_id]["barplot"]
        download_filename = lib_id + "_relative_knockdown.jpeg"
        # download_filename is the name the user will get when they download the
        # file from the ENCODE Portal.
        dcc_uuid = conn.post_document(download_filename=download_filename,
                                      document=barplot,
                                      document_type="data QA",
                                      document_description=barplot_description)
        line = dico[lib_id]["line"]
        line.append(dcc_uuid)
        fout.write("\t".join(line) + "\n")
        # link document to library
        conn.link_document(rec_id=lib_id, dcc_document_uuid=dcc_uuid)
    fout.close()

    print(
        "Linking RSEM analysis and plotting protocol document to each experiment"
    )
    for exp in exp_encids:
        conn.link_document(rec_id=exp, document_id=protocol_uuid)
Example #13
def main():
    """Program
    """
    parser = get_parser()
    args = parser.parse_args()
    mode = args.dcc_mode
    exp_id = args.exp_id

    conn = Connection(mode)
    exp_rep_dico = conn.get_fastqfile_replicate_hash(exp_id)
    exp_json = conn.get(exp_id, ignore404=False)  # The experiment must exist; raise on 404.
    controls = exp_json["possible_controls"]  # A list of dicts.

    # Populate a controls-lookup hash. The keys will be the ctl accessions. Each value will be
    # the replicates hash (the return value of conn.get_fastqfile_replicate_hash()).
    controls_hash = {}  # A dict of dicts.
    control_bio_rep_counts = []
    for c in controls:
        ctl_accession = c["accession"]
        controls_hash[ctl_accession] = {}
        ctl_rep_dico = conn.get_fastqfile_replicate_hash(ctl_accession)
        controls_hash[ctl_accession]["rep_dico"] = ctl_rep_dico
        control_bio_rep_counts.append(len(ctl_rep_dico.keys()))

    # Make sure that all control experiments have the same number of biological replicates. There are
    # no known rules to apply otherwise.
    if len(set(control_bio_rep_counts)) != 1:
        raise Exception(
            "The controls '{controls}' have different numbers of biological replicates from one another '{rep_nums}'."
            .format(controls=list(controls_hash), rep_nums=control_bio_rep_counts))

    # Make sure that the number of control bio reps equals the number of experiment bio reps:
    exp_bio_rep_count = len(exp_rep_dico.keys())
    if exp_bio_rep_count != control_bio_rep_counts[0]:
        raise Exception(
            "The number of experiment replicates '{}' doesn't equal the number of control replicates '{}'."
            .format(exp_bio_rep_count, control_bio_rep_counts[0]))

    # Now we'll look at each bio rep on the experiment, in numerical order of
    # biological_replicate_number from least to greatest. We'll work our way all the way down to the
    # FASTQ files and start populating the File.controlled_by property in the following manner:
    #
    #  For each control, we'll sort the replicates the same way as we did for the ones on the
    #  experiment, then for the replicate having the same ordinal index, we'll add the FASTQ File
    #  references.
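    #
    #  For example (hypothetical): if the experiment and every control each have bio reps
    #  [1, 2], then each read-1 FASTQ on experiment bio rep 1 gets controlled_by set to
    #  the read-1 FASTQs of bio rep 1 of every control, and likewise for rep 2 and read 2.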

    sorted_exp_bio_reps = sorted(exp_rep_dico)
    count = -1
    # And now for the nastiest for-loop I've ever written ... this should be cleaned up but the logic
    # is so rough to implement that it'll be ugly any way we look at it.
    for b in sorted_exp_bio_reps:  # biological_replicate_number
        count += 1
        for t in exp_rep_dico[b]:  # technical_replicate_number
            for read_num in exp_rep_dico[b][t]:
                for fastq_json in exp_rep_dico[b][t][read_num]:
                    exp_file_acc = fastq_json["accession"]
                    controlled_by = []
                    for c in controls_hash:
                        ctl_bio_rep_num = sorted(
                            controls_hash[c]["rep_dico"])[count]
                        ctl_tech_reps = controls_hash[c]["rep_dico"][
                            ctl_bio_rep_num]
                        for ctl_tech_rep_num in ctl_tech_reps:
                            for ctl_encff in ctl_tech_reps[ctl_tech_rep_num][
                                    read_num]:
                                controlled_by.append(ctl_encff["accession"])
                    conn.patch(
                        {
                            conn.ENCID_KEY: exp_file_acc,
                            "controlled_by": controlled_by
                        },
                        extend_array_values=False)
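Example #14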
class Accession(object):
    """docstring for Accession"""
    def __init__(self, steps, metadata_json, server, lab, award):
        super(Accession, self).__init__()
        self.set_lab_award(lab, award)
        self.analysis = Analysis(metadata_json)
        self.steps_and_params_json = self.file_to_json(steps)
        self.backend = self.analysis.backend
        self.conn = Connection(server)
        self.new_files = []
        self.current_user = self.get_current_user()

    def set_lab_award(self, lab, award):
        global COMMON_METADATA
        COMMON_METADATA['lab'] = lab
        COMMON_METADATA['award'] = award

    def get_current_user(self):
        response = requests.get(self.conn.dcc_url + '/session-properties',
                                auth=self.conn.auth)
        if response.ok:
            user = response.json().get('user')
            if user:
                return user.get('@id')
            raise Exception('Authenticated user not found')
        else:
            raise Exception('Request to portal failed')

    def file_to_json(self, file):
        with open(file) as json_file:
            json_obj = json.load(json_file)
        return json_obj

    def accession_fastqs(self):
        pass

    def wait_for_portal(self):
        pass

    def file_at_portal(self, file):
        self.wait_for_portal()
        md5sum = self.backend.md5sum(file)
        search_param = [('md5sum', md5sum), ('type', 'File')]
        encode_file = self.conn.search(search_param)
        if len(encode_file) > 0:
            return self.conn.get(encode_file[0].get('accession'))

    def raw_fastq_inputs(self, file):
        if not file.task and 'fastqs' in file.filekeys:
            yield file
        if file.task:
            for input_file in file.task.input_files:
                yield from self.raw_fastq_inputs(input_file)

    def raw_files_accessioned(self):
        for file in self.analysis.raw_fastqs:
            if not self.file_at_portal(file.filename):
                return False
        return True

    def accession_file(self, encode_file, gs_file):
        file_exists = self.file_at_portal(gs_file.filename)
        submitted_file_path = {'submitted_file_name': gs_file.filename}
        if not file_exists:
            local_file = self.backend.download(gs_file.filename)[0]
            encode_file['submitted_file_name'] = local_file
            encode_posted_file = self.conn.post(encode_file)
            os.remove(local_file)
            encode_posted_file = self.patch_file(encode_posted_file,
                                                 submitted_file_path)
            self.new_files.append(encode_posted_file)
            return encode_posted_file
        elif (file_exists
              and file_exists.get('status') in ['deleted', 'revoked']):
            encode_file.update(submitted_file_path)
            # Update the file to current user
            # TODO: Reverse this when duplicate md5sums are enabled
            encode_file.update({'submitted_by': self.current_user})
            encode_patched_file = self.patch_file(file_exists, encode_file)
            self.new_files.append(encode_patched_file)
            return encode_patched_file
        return file_exists

    def patch_file(self, encode_file, new_properties):
        new_properties[self.conn.ENCID_KEY] = encode_file.get('accession')
        return self.conn.patch(new_properties, extend_array_values=False)

    def get_or_make_step_run(self, lab_prefix, run_name, step_version,
                             task_name):
        docker_tag = self.analysis.get_tasks(task_name)[0].docker_image.split(
            ':')[1]
        payload = {
            'aliases': ["{}:{}-{}".format(lab_prefix, run_name, docker_tag)],
            'status': 'released',
            'analysis_step_version': step_version
        }
        payload[Connection.PROFILE_KEY] = 'analysis_step_runs'
        print(payload)
        return self.conn.post(payload)

    @property
    def assembly(self):
        assembly = [
            reference for reference in ASSEMBLIES
            if reference in self.analysis.get_tasks('read_genome_tsv')
            [0].outputs.get('genome', {}).get('ref_fa', '')
        ]
        return assembly[0] if len(assembly) > 0 else ''

    @property
    def lab_pi(self):
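        # e.g. a lab @id such as '/labs/michael-snyder/' yields 'michael-snyder'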
        return COMMON_METADATA['lab'].split('/labs/')[1].split('/')[0]

    @property
    def dataset(self):
        return self.file_at_portal(
            self.analysis.raw_fastqs[0].filename).get('dataset')

    def file_from_template(self,
                           file,
                           file_format,
                           output_type,
                           step_run,
                           derived_from,
                           dataset,
                           file_format_type=None):
        file_name = file.filename.split('gs://')[-1].replace('/', '-')
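        # e.g. 'gs://bucket/out/peaks.bed' becomes 'bucket-out-peaks.bed'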
        obj = {
            'status': 'uploading',
            'aliases': ['{}:{}'.format(self.lab_pi, file_name)],
            'file_format': file_format,
            'output_type': output_type,
            'assembly': self.assembly,
            'dataset': dataset,
            'step_run': step_run.get('@id'),
            'derived_from': derived_from,
            'file_size': file.size,
            'md5sum': file.md5sum
        }
        if file_format_type:
            obj['file_format_type'] = file_format_type
        obj[Connection.PROFILE_KEY] = 'file'
        obj.update(COMMON_METADATA)
        return obj

    def get_derived_from_all(self, file, files, inputs=False):
        ancestors = []
        for ancestor in files:
            ancestors.append(
                self.get_derived_from(file, ancestor.get('derived_from_task'),
                                      ancestor.get('derived_from_filekey'),
                                      ancestor.get('derived_from_output_type'),
                                      ancestor.get('derived_from_inputs')))
        return list(self.flatten(ancestors))

    def flatten(self, nested_list):
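        # e.g. list(self.flatten(['a', ['b', ['c']]])) == ['a', 'b', 'c']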
        if isinstance(nested_list, str):
            yield nested_list
        if isinstance(nested_list, list):
            for item in nested_list:
                yield from self.flatten(item)

    # Returns list of accession ids of files on portal or recently accessioned
    def get_derived_from(self,
                         file,
                         task_name,
                         filekey,
                         output_type=None,
                         inputs=False):
        derived_from_files = list(
            set(
                list(
                    self.analysis.search_up(file.task, task_name, filekey,
                                            inputs))))
        encode_files = [
            self.file_at_portal(gs_file.filename)
            for gs_file in derived_from_files
        ]
        accessioned_files = encode_files + self.new_files
        accessioned_files = [x for x in accessioned_files if x is not None]
        derived_from_accession_ids = []
        for gs_file in derived_from_files:
            for encode_file in accessioned_files:
                if gs_file.md5sum == encode_file.get('md5sum'):
                    # Optimal peaks can be mistaken for conservative peaks
                    # when their md5sum is the same
                    if output_type and output_type != encode_file.get(
                            'output_type'):
                        continue
                    derived_from_accession_ids.append(
                        encode_file.get('accession'))
        derived_from_accession_ids = list(set(derived_from_accession_ids))

        # Raise exception when some or all of the derived_from files
        # are missing from the portal
        if not derived_from_accession_ids:
            raise Exception(
                'Missing all of the derived_from files on the portal')
        if len(derived_from_accession_ids) != len(derived_from_files):
            raise Exception(
                'Missing some of the derived_from files on the portal')
        return [
            '/files/{}/'.format(accession_id)
            for accession_id in derived_from_accession_ids
        ]

    # File object to be accessioned
    # inputs=True will search for input fastqs in derived_from

    def make_file_obj(self,
                      file,
                      file_format,
                      output_type,
                      step_run,
                      derived_from_files,
                      file_format_type=None,
                      inputs=False):
        derived_from = self.get_derived_from_all(file, derived_from_files,
                                                 inputs)
        return self.file_from_template(file, file_format, output_type,
                                       step_run, derived_from, self.dataset,
                                       file_format_type)

    def get_bio_replicate(self, encode_file, string=True):
        replicate = encode_file.get('biological_replicates')[0]
        if string:
            return str(replicate)
        return int(replicate)

    def attach_idr_qc_to(self, encode_file, gs_file):
        if list(
                filter(lambda x: 'IDRQualityMetric' in x['@type'],
                       encode_file['quality_metrics'])):
            return
        qc = self.backend.read_json(self.analysis.get_files('qc_json')[0])
        idr_qc = qc['idr_frip_qc']
        replicate = self.get_bio_replicate(encode_file)
        rep_pr = idr_qc['rep' + replicate + '-pr']
        frip_score = rep_pr['FRiP']
        idr_peaks = qc['ataqc']['rep' + replicate]['IDR peaks'][0]
        step_run = encode_file.get('step_run')
        if isinstance(step_run, str):
            step_run_id = step_run
        elif isinstance(step_run, dict):
            step_run_id = step_run.get('@id')
        qc_object = {}
        qc_object['F1'] = frip_score
        qc_object['N1'] = idr_peaks
        idr_cutoff = self.analysis.metadata['inputs']['atac.idr_thresh']
        # Strongly expects that plot exists
        plot_png = next(
            self.analysis.search_up(gs_file.task, 'idr_pr', 'idr_plot'))
        qc_object.update({
            'step_run':
            step_run_id,
            'quality_metric_of': [encode_file.get('@id')],
            'IDR_cutoff':
            idr_cutoff,
            'status':
            'released',
            'IDR_plot_rep{}_pr'.format(replicate):
            self.get_attachment(plot_png, 'image/png')
        })
        qc_object.update(COMMON_METADATA)
        qc_object[Connection.PROFILE_KEY] = 'idr-quality-metrics'
        posted_qc = self.conn.post(qc_object, require_aliases=False)
        return posted_qc

    def attach_flagstat_qc_to(self, encode_bam_file, gs_file):
        # Return early if qc metric exists
        if list(
                filter(
                    lambda x: 'SamtoolsFlagstatsQualityMetric' in x['@type'],
                    encode_bam_file['quality_metrics'])):
            return
        qc = self.backend.read_json(self.analysis.get_files('qc_json')[0])
        replicate = self.get_bio_replicate(encode_bam_file)
        flagstat_qc = qc['nodup_flagstat_qc']['rep' + replicate]
        for key, value in flagstat_qc.items():
            if '_pct' in key:
                flagstat_qc[key] = '{}%'.format(value)
        step_run = encode_bam_file.get('step_run')
        if isinstance(step_run, str):
            step_run_id = step_run
        elif isinstance(step_run, dict):
            step_run_id = step_run.get('@id')
        flagstat_qc.update({
            'step_run': step_run_id,
            'quality_metric_of': [encode_bam_file.get('@id')],
            'status': 'released'
        })
        flagstat_qc.update(COMMON_METADATA)
        flagstat_qc[
            Connection.PROFILE_KEY] = 'samtools-flagstats-quality-metric'
        posted_qc = self.conn.post(flagstat_qc, require_aliases=False)
        return posted_qc

    def attach_cross_correlation_qc_to(self, encode_bam_file, gs_file):
        # Return early if qc metric exists
        if list(
                filter(lambda x: 'ComplexityXcorrQualityMetric' in x['@type'],
                       encode_bam_file['quality_metrics'])):
            return

        qc = self.backend.read_json(self.analysis.get_files('qc_json')[0])
        plot_pdf = next(
            self.analysis.search_down(gs_file.task, 'xcor', 'plot_pdf'))
        read_length_file = next(
            self.analysis.search_up(gs_file.task, 'bowtie2', 'read_len_log'))
        read_length = int(
            self.backend.read_file(read_length_file.filename).decode())
        replicate = self.get_bio_replicate(encode_bam_file)
        xcor_qc = qc['xcor_score']['rep' + replicate]
        pbc_qc = qc['pbc_qc']['rep' + replicate]
        step_run = encode_bam_file.get('step_run')
        if isinstance(step_run, str):
            step_run_id = step_run
        elif isinstance(step_run, dict):
            step_run_id = step_run.get('@id')

        xcor_object = {
            'NRF':
            pbc_qc['NRF'],
            'PBC1':
            pbc_qc['PBC1'],
            'PBC2':
            pbc_qc['PBC2'],
            'NSC':
            xcor_qc['NSC'],
            'RSC':
            xcor_qc['RSC'],
            'sample size':
            xcor_qc['num_reads'],
            "fragment length":
            xcor_qc['est_frag_len'],
            "quality_metric_of": [encode_bam_file.get('@id')],
            "step_run":
            step_run_id,
            "paired-end":
            self.analysis.metadata['inputs']['atac.paired_end'],
            "read length":
            read_length,
            "status":
            "released",
            "cross_correlation_plot":
            self.get_attachment(plot_pdf, 'application/pdf')
        }

        xcor_object.update(COMMON_METADATA)
        xcor_object[
            Connection.PROFILE_KEY] = 'complexity-xcorr-quality-metrics'
        posted_qc = self.conn.post(xcor_object, require_aliases=False)
        return posted_qc

    def file_has_qc(self, bam, qc):
        for item in bam['quality_metrics']:
            if item['@type'][0] == qc['@type'][0]:
                return True
        return False

    def get_attachment(self, gs_file, mime_type):
        contents = self.backend.read_file(gs_file.filename)
        # base64-encode, then decode to str so the value is JSON-serializable.
        contents = b64encode(contents).decode('ascii')
        obj = {
            'type': mime_type,
            'download': gs_file.filename.split('/')[-1],
            'href': 'data:{};base64,{}'.format(mime_type, contents)
        }
        return obj

    def accession_step(self, single_step_params):
        step_run = self.get_or_make_step_run(
            self.lab_pi, single_step_params['dcc_step_run'],
            single_step_params['dcc_step_version'],
            single_step_params['wdl_task_name'])
        accessioned_files = []
        for task in self.analysis.get_tasks(
                single_step_params['wdl_task_name']):
            for file_params in single_step_params['wdl_files']:
                for wdl_file in [
                        file for file in task.output_files
                        if file_params['filekey'] in file.filekeys
                ]:

                    # Conservative IDR thresholded peaks may have
                    # the same md5sum as optimal one
                    try:
                        obj = self.make_file_obj(
                            wdl_file,
                            file_params['file_format'],
                            file_params['output_type'],
                            step_run,
                            file_params['derived_from_files'],
                            file_format_type=file_params.get(
                                'file_format_type'))
                        encode_file = self.accession_file(obj, wdl_file)
                    except Exception as e:
                        if 'Conflict' in str(e) and file_params.get(
                                'possible_duplicate'):
                            continue
                        elif 'Missing all of the derived_from' in str(e):
                            continue
                        else:
                            raise

                    # The input parameter file assumes that Accession implements
                    # the methods needed to attach the quality metrics.
                    quality_metrics = file_params.get('quality_metrics', [])
                    for qc in quality_metrics:
                        qc_method = getattr(self, QC_MAP[qc])
                        # Pass encode file with
                        # calculated properties
                        qc_method(self.conn.get(encode_file.get('accession')),
                                  wdl_file)
                    accessioned_files.append(encode_file)
        return accessioned_files

    def accession_steps(self):
        for step in self.steps_and_params_json:
            self.accession_step(step)
Example #15
def _main():
    main_args = _parse_args()
    test_demo_url = os.environ['TEST_DEMO_URL']
    conn = Connection(test_demo_url, main_args.dry_run)
    conn.regenerate_aws_upload_creds(main_args.file_id)
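The _parse_args() helper isn't shown. A minimal sketch consistent with the two attributes _main() reads (file_id and dry_run); the real version may differ:

import argparse

def _parse_args():
    # Hypothetical reconstruction based only on the attributes read in _main().
    parser = argparse.ArgumentParser()
    parser.add_argument("file_id",
                        help="Identifier of the file record whose AWS upload credentials should be regenerated.")
    parser.add_argument("--dry-run", dest="dry_run", action="store_true",
                        help="Instantiate the Connection in dry-run mode.")
    return parser.parse_args()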