Exemplo n.º 1
0
    def test_ordering_mismatch(self):
        """Check that the chosen title does not depend on sample key order.

        The JSON API samples and the SDRF samples for one experiment are
        keyed by the same title field; every title extracted from the JSON
        side must be a key of the harmonized SDRF samples.

        Related: https://github.com/AlexsLemonade/refinebio/pull/304
        """
        accession = "E-TABM-38"

        # JSON side: fetch the sample listing and flatten every record.
        endpoint = SAMPLES_URL.format(accession)
        response = utils.requests_retry_session().get(endpoint, timeout=60)
        flat_api_samples = []
        for api_sample in response.json()["experiment"]["sample"]:
            flat_api_samples.append(utils.flatten(api_sample))

        # SDRF side: parse the experiment's SDRF file.
        SDRF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
        sdrf_samples_raw = parse_sdrf(SDRF_URL_TEMPLATE.format(code=accession))

        title_field = determine_title_field(sdrf_samples_raw, flat_api_samples)
        harmonized_by_title = harmonize_all_samples(sdrf_samples_raw, title_field)
        api_titles = [extract_title(flat, title_field) for flat in flat_api_samples]

        # A plain lookup is the assertion here: if the ordering of the
        # sample dicts were not corrected for, the titles would not line
        # up and indexing would raise a KeyError, failing the test.
        for api_title in api_titles:
            harmonized_by_title[api_title]
Exemplo n.º 2
0
    def test_sdrf_harmony(self):
        """Harmonize the E-MTAB-3050 SDRF and check one known sample.

        Verifies that the sample titled "donor A islets RNA" is present in
        the harmonized output, that its sex and age have the expected
        values, and that the other expected harmonized keys exist.

        Specific assert methods (assertIn / assertEqual) are used instead of
        assertTrue so that a failure reports the offending values rather
        than just "False is not true".
        """

        metadata = parse_sdrf(
            "https://www.ebi.ac.uk/arrayexpress/files/E-MTAB-3050/E-MTAB-3050.sdrf.txt"
        )
        harmonized = harmonize_all_samples(metadata)

        title = "donor A islets RNA"
        self.assertIn(title, harmonized)

        # Look the sample up once; every remaining check is about it.
        sample = harmonized[title]
        self.assertIn("sex", sample)
        self.assertEqual("female", sample["sex"])
        self.assertIn("age", sample)
        self.assertEqual(54.0, sample["age"])
        self.assertIn("specimen_part", sample)
        self.assertIn("subject", sample)
        self.assertIn("developmental_stage", sample)
Exemplo n.º 3
0
    def test_sra_lots(self):
        """Smoke test: harmonize metadata for a handful of SRA run accessions.

        Accessions for which an UnsupportedDataTypeError is raised are
        skipped; for all others the harmonized result must not be None.
        """

        # A list like this can be built via
        #    https://www.ncbi.nlm.nih.gov/sra
        # by searching for
        #    (human) NOT cluster_dbgap[PROP]
        # and then choosing Sent To -> File -> Accession List
        run_accessions = [
            "ERR188021",
            "ERR188022",
            "ERR205021",
            "ERR205022",
            "ERR205023",
            "SRR000001",  # Soft fail, bad platform
            "ERR1737666",
            "ERR030891",
            "ERR030892",
            "SRR1542948",
            "SRR1553477",
            "SRR1542330",
            "SRR1538698",
            "SRR1538760",
            "SRR1538866",
            "SRR1539218",
            "SRR1797277",
            "SRR1533126",
        ]
        for run_accession in run_accessions:
            try:
                run_metadata = SraSurveyor.gather_all_metadata(run_accession)
                harmonized = harmonize_all_samples([run_metadata])
            except UnsupportedDataTypeError:
                # Unsupported data types are an expected soft failure.
                continue
            self.assertIsNotNone(harmonized)
Exemplo n.º 4
0
    def test_sdrf_big(self):
        """Run SDRF harmonization over a large set of experiment accessions.

        For each accession the SDRF is parsed; experiments whose parse
        result is empty are skipped, and for the rest the harmonized output
        must not be None.
        """

        accessions = [
            "E-GEOD-59071",
            "E-MTAB-2967",
            "E-GEOD-36807",
            "E-MTAB-184",
            "E-GEOD-22619",
            "E-GEOD-25220",
            "E-GEOD-24287",
            "E-GEOD-13367",
            "E-GEOD-4183",
            "E-GEOD-3365",
            "E-GEOD-57183",
            "E-GEOD-67596",
            "E-GEOD-58667",
            "E-GEOD-55319",
            "E-GEOD-26112",
            "E-GEOD-41831",
            "E-GEOD-41744",
            "E-GEOD-26554",
            "E-GEOD-20307",
            "E-GEOD-23687",
            "E-GEOD-21521",
            "E-GEOD-13501",
            "E-GEOD-13849",
            "E-GEOD-15645",
            "E-GEOD-11083",
            "E-GEOD-15083",
            "E-GEOD-11907",
            "E-GEOD-8650",
            "E-GEOD-7753",
            "E-GEOD-68004",
            "E-GEOD-63881",
            "E-GEOD-48498",
            "E-GEOD-16797",
            "E-MTAB-5542",
            "E-GEOD-81622",
            "E-GEOD-65391",
            "E-GEOD-72798",
            "E-GEOD-72747",
            "E-GEOD-78193",
            "E-GEOD-62764",
            "E-GEOD-45291",
            "E-GEOD-50772",
            "E-GEOD-61635",
            "E-GEOD-45923",
            "E-GEOD-49454",
            "E-GEOD-29536",
            "E-GEOD-52471",
            "E-GEOD-50635",
            "E-GEOD-37463",
            "E-GEOD-37460",
            "E-GEOD-37455",
            "E-GEOD-39088",
            "E-GEOD-32591",
            "E-GEOD-36941",
            "E-GEOD-32279",
            "E-GEOD-26975",
            "E-GEOD-24060",
            "E-GEOD-24706",
            "E-MEXP-1635",
            "E-MTAB-5262",
            "E-GEOD-72246",
            "E-GEOD-80047",
            "E-GEOD-69967",
            "E-GEOD-82140",
            "E-GEOD-75890",
            "E-GEOD-67853",
            "E-GEOD-75343",
            "E-GEOD-50614",
            "E-GEOD-57376",
            "E-GEOD-61281",
            "E-GEOD-47751",
            "E-GEOD-58121",
            "E-GEOD-51440",
            "E-GEOD-55201",
            "E-GEOD-53552",
            "E-GEOD-50790",
            "E-GEOD-47598",
            "E-GEOD-41664",
            "E-GEOD-41663",
            "E-GEOD-41662",
            "E-GEOD-34248",
            "E-GEOD-30999",
            "E-GEOD-31652",
            "E-GEOD-30768",
            "E-GEOD-27887",
            "E-GEOD-18948",
            "E-GEOD-11903",
            "E-GEOD-13355",
            "E-GEOD-2737",
            "E-GEOD-78068",
            "E-GEOD-74143",
            "E-GEOD-58795",
            "E-GEOD-48780",
            "E-GEOD-55584",
            "E-GEOD-55457",
            "E-GEOD-55235",
            "E-GEOD-35455",
            "E-GEOD-45867",
            "E-GEOD-30023",
            "E-GEOD-42296",
            "E-GEOD-39340",
            "E-GEOD-37107",
            "E-GEOD-33377",
            "E-MEXP-3390",
            "E-GEOD-25160",
            "E-GEOD-24742",
            "E-GEOD-15573",
            "E-MTAB-11",
            "E-GEOD-15258",
            "E-GEOD-15602",
            "E-GEOD-12021",
            "E-GEOD-8350",
            "E-GEOD-1402",
            "E-GEOD-56998",
            "E-GEOD-42832",
            "E-GEOD-37912",
            "E-GEOD-32887",
            "E-GEOD-19314",
            "E-GEOD-18781",
            "E-GEOD-16538",
            "E-GEOD-66795",
            "E-GEOD-40568",
            "E-MTAB-2073",
            "E-GEOD-51092",
            "E-GEOD-48378",
            "E-GEOD-40611",
            "E-GEOD-23117",
            "E-MEXP-1883",
            "E-GEOD-81292",
            "E-GEOD-76886",
            "E-GEOD-76809",
            "E-GEOD-65405",
            "E-GEOD-65336",
            "E-GEOD-58095",
            "E-MEXP-1214",
            "E-GEOD-48149",
            "E-MEXP-32",
            "E-GEOD-33463",
            "E-GEOD-32413",
            "E-GEOD-19617",
            "E-MTAB-1944",
            "E-GEOD-17114",
            "E-GEOD-44719",
        ]

        files_root = "https://www.ebi.ac.uk/arrayexpress/files/"
        for code in accessions:
            # The SDRF lives at <root>/<accession>/<accession>.sdrf.txt
            sdrf_url = files_root + code + "/" + code + ".sdrf.txt"
            parsed = parse_sdrf(sdrf_url)
            if parsed:
                # Only experiments that produced metadata are harmonized.
                self.assertIsNotNone(harmonize_all_samples(parsed))
Exemplo n.º 5
0
    def create_samples_from_api(self, experiment: Experiment, platform_dict: Dict) -> List[Sample]:
        """Generates a Sample item for each sample in an AE experiment.

        There are many possible data situations for a sample:

            - If the sample only has raw data available:
                - If it is on a platform that we support:
                    Download this raw data and process it
                - If it is not on a platform we support:
                    Don't download anything, don't process anything
            - If the sample has both raw and derived data:
                - If the raw data is on a platform we support:
                    Download the raw data and process it, abandon the derived data
                - If the raw data is not on a platform we support
                    Download the derived data and no-op it, abandon the raw data
            - If the sample only has derived data:
                Download the derived data and no-op it.

        See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/samples

        Args:
            experiment: The experiment whose samples are surveyed. Its
                accession_code, protocol_description and source_url are read.
            platform_dict: Platform description; the keys
                "platform_accession_name", "platform_accession_code" and
                "manufacturer" are copied onto newly created samples.

        Returns:
            The Sample objects that were newly created by this call. Samples
            that already existed are (re)associated with the experiment but
            are not included in the returned list.
        """

        created_samples = []

        samples_endpoint = SAMPLES_URL.format(experiment.accession_code)
        r = utils.requests_retry_session().get(samples_endpoint, timeout=60)
        samples = r.json()["experiment"]["sample"]

        # The SDRF is the complete metadata record on a sample/property basis.
        # We run this through our harmonizer and then attach the properties
        # to our created samples.
        SDRF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
        sdrf_url = SDRF_URL_TEMPLATE.format(code=experiment.accession_code)
        sdrf_samples = harmony.parse_sdrf(sdrf_url)

        title_field = harmony.determine_title_field(sdrf_samples, samples)
        harmonized_samples = harmony.harmonize_all_samples(sdrf_samples, title_field)

        # An experiment can have many samples
        for sample_data in samples:

            # For some reason, this sample has no files associated with it.
            if not sample_data.get("file"):
                continue

            # Each sample is given an experimentally-unique title.
            flat_sample = utils.flatten(sample_data)
            title = harmony.extract_title(flat_sample, title_field)

            # A sample may actually have many sub files.
            # If there is raw data, take that.
            # If not, take the derived.
            has_raw = False
            for sub_file in sample_data["file"]:

                # For ex: E-GEOD-15645
                # NOTE(review): this overwrites the comment list inside
                # sample_data itself, so the list branch of the download-url
                # lookup below never sees a list for these files (the first
                # comment's value is used), and the mutated record is what
                # gets stored as "raw_metadata". Kept as-is to preserve
                # existing behavior -- confirm whether this is intended.
                if isinstance(sub_file["comment"], list):
                    sub_file["comment"] = sub_file["comment"][0]
                file_comment = sub_file["comment"]

                # Some have the 'data' field, but not the actual data
                # Ex: E-GEOD-9656
                if sub_file["type"] == "data" and file_comment.get("value") is not None:
                    has_raw = True

                # 'value' can be None, convert to an empty string to
                # make it easier to use.
                comment_value = file_comment.get("value", "") or ""
                if "raw" in comment_value:
                    has_raw = True

            # Don't get the raw data if it's only a 1-color sample.
            # This depends only on the sample, not on the individual file,
            # so it is evaluated once instead of once per sub file.
            sample_repr = str(sample_data)
            if "Cy3" in sample_repr and "Cy5" not in sample_repr:
                has_raw = False

            skip_sample = False
            # Reset per sample so values from a previous sample can never
            # leak into this one when every sub file is filtered out.
            filename = None
            download_url = None
            for sub_file in sample_data["file"]:

                # Skip derived data if we have it raw.
                if has_raw and "derived data" in sub_file["type"]:
                    continue

                download_url = None
                filename = sub_file["name"]

                # sub_file["comment"] is only a list if there's
                # more than one comment...
                comments = sub_file["comment"]
                if isinstance(comments, list):
                    # Could be: "Derived ArrayExpress Data Matrix FTP
                    # file" or: "ArrayExpress FTP file". If there is
                    # no comment with a name including "FTP file" then
                    # we don't know where to download it, so the sample
                    # is reported and skipped below.
                    for comment in comments:
                        if "FTP file" in comment["name"]:
                            download_url = comment["value"]
                            break
                else:
                    download_url = comments["value"]

                # The accession code is only determined after this loop, so
                # the sample is identified by its title in these messages.
                if not download_url:
                    logger.error(
                        "Sample %s did not specify a download url, skipping.",
                        title,
                        experiment_accession_code=experiment.accession_code,
                        survey_job=self.survey_job.id,
                        sub_file=sub_file,
                    )
                    skip_sample = True
                    continue

                if not filename:
                    logger.error(
                        "Sample %s did not specify a filename, skipping.",
                        title,
                        experiment_accession_code=experiment.accession_code,
                        survey_job=self.survey_job.id,
                        sub_file=sub_file,
                    )
                    skip_sample = True
                    continue

            if skip_sample:
                continue

            # Every sub file was filtered out as derived data, so there is
            # nothing to download for this sample.
            if filename is None:
                logger.error(
                    "Sample %s has no usable files, skipping.",
                    title,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                )
                continue

            # The accession code is not a simple matter to determine.
            sample_source_name = sample_data["source"].get("name", "")
            sample_assay_name = sample_data["assay"].get("name", "")
            sample_accession_code = self.determine_sample_accession(
                experiment.accession_code, sample_source_name, sample_assay_name, filename
            )

            # Figure out the Organism for this sample
            organism_name = UNKNOWN
            for characteristic in sample_data["characteristic"]:
                if characteristic["category"].upper() == "ORGANISM":
                    organism_name = characteristic["value"].upper()

            if organism_name == UNKNOWN:
                logger.error(
                    "Sample %s did not specify the organism name.",
                    sample_accession_code,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                )
                continue

            organism = Organism.get_object_for_name(organism_name)

            # Create the sample object
            try:
                # Associate it with the experiment, but since it
                # already exists it already has original files
                # associated with it and it's already been downloaded,
                # so don't add it to created_samples.
                sample_object = Sample.objects.get(accession_code=sample_accession_code)

                # If input experiment includes new protocol information,
                # update sample's protocol_info.
                existing_protocols = sample_object.protocol_info
                protocol_info, is_updated = self.update_sample_protocol_info(
                    existing_protocols,
                    experiment.protocol_description,
                    experiment.source_url + "/protocols",
                )
                if is_updated:
                    sample_object.protocol_info = protocol_info
                    sample_object.save()

                logger.debug(
                    "Sample %s already exists, skipping object creation.",
                    sample_accession_code,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                )
            except Sample.DoesNotExist:
                sample_object = Sample()

                # The basics
                sample_object.source_database = "ARRAY_EXPRESS"
                sample_object.title = title
                sample_object.accession_code = sample_accession_code
                sample_object.source_archive_url = samples_endpoint
                sample_object.organism = organism
                sample_object.platform_name = platform_dict["platform_accession_name"]
                sample_object.platform_accession_code = platform_dict["platform_accession_code"]
                sample_object.manufacturer = platform_dict["manufacturer"]
                sample_object.technology = "MICROARRAY"

                protocol_info, is_updated = self.update_sample_protocol_info(
                    existing_protocols=[],
                    experiment_protocol=experiment.protocol_description,
                    protocol_url=experiment.source_url + "/protocols",
                )
                # Do not check is_updated the first time because we must
                # save a list so we can append to it later.
                sample_object.protocol_info = protocol_info

                sample_object.save()

                # Directly assign the harmonized properties
                harmonized_sample = harmonized_samples[title]
                ArrayExpressSurveyor._apply_harmonized_metadata_to_sample(
                    sample_object, harmonized_sample
                )

                sample_annotation = SampleAnnotation()
                sample_annotation.name = "raw_metadata"
                sample_annotation.data = sample_data
                sample_annotation.sample = sample_object
                sample_annotation.is_ccdl = False
                sample_annotation.save()

                # Only the last sub file that survived the filtering above
                # is recorded as the sample's original file.
                original_file = OriginalFile()
                original_file.filename = filename
                original_file.source_filename = filename
                original_file.source_url = download_url
                original_file.is_downloaded = False
                original_file.is_archive = True
                original_file.has_raw = has_raw
                original_file.save()

                original_file_sample_association = OriginalFileSampleAssociation()
                original_file_sample_association.original_file = original_file
                original_file_sample_association.sample = sample_object
                original_file_sample_association.save()

                created_samples.append(sample_object)

                logger.debug(
                    "Created " + str(sample_object),
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                    sample=sample_object.id,
                )

            # Create associations if they don't already exist
            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment, sample=sample_object
            )

            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment, organism=organism
            )

        return created_samples