示例#1
0
文件: geo.py 项目: erflynn/refinebio
    def _apply_metadata_to_experiment(experiment: Experiment, gse):
        """Copy the series-level metadata in `gse` onto `experiment`.

        The experiment is mutated in place; it is not saved here.

        Args:
            experiment: the Experiment to populate. Its `accession_code`
                must already be set, because the source URL is built
                from it.
            gse: an object whose `metadata` attribute maps field names to
                lists of strings (only the first entry of each list is
                used, except for "contact_institute").

        Raises:
            KeyError: if `gse.metadata` has no "submission_date",
                "last_update_date" or "contact_institute" entry.
        """
        metadata = gse.metadata

        experiment.source_url = (
            "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=" +
            experiment.accession_code)
        experiment.source_database = "GEO"
        experiment.title = metadata.get("title", [""])[0]
        experiment.description = metadata.get("summary", [""])[0]

        # Source doesn't provide time information, assume midnight.
        submission_date = metadata["submission_date"][0] + " 00:00:00 UTC"
        experiment.source_first_published = dateutil.parser.parse(
            submission_date)
        last_updated_date = metadata["last_update_date"][0] + " 00:00:00 UTC"
        # The SRA and ArrayExpress surveyors store this timestamp in
        # `source_last_modified`; use the same attribute here so the
        # value is not written to a name nothing else reads.
        experiment.source_last_modified = dateutil.parser.parse(
            last_updated_date)

        # De-duplicate while keeping the order the source listed them in,
        # so the joined string is the same on every run.
        unique_institutions = list(
            dict.fromkeys(metadata["contact_institute"]))
        experiment.submitter_institution = ", ".join(unique_institutions)
        experiment.pubmed_id = metadata.get("pubmed_id", [""])[0]

        # Scrape publication title and authorship from Pubmed
        if experiment.pubmed_id:
            # Flag the publication, as the other sources do whenever a
            # PubMed ID is available.
            experiment.has_publication = True
            pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                experiment.pubmed_id)
            experiment.publication_title = pubmed_metadata[0]
            experiment.publication_authors = pubmed_metadata[1]
示例#2
0
 def test_get_pubmed_id_title(self):
     """Look up PMID 22367537 and check the returned title and authors."""
     expected_title = (
         "Sequencing of neuroblastoma identifies chromothripsis and defects in neuritogenesis genes."
     )
     expected_authors = [
         "Molenaar JJ", "Koster J", "Zwijnenburg DA", "van Sluis P",
         "Valentijn LJ", "van der Ploeg I", "Hamdi M", "van Nes J",
         "Westerman BA", "van Arkel J", "Ebus ME", "Haneveld F",
         "Lakeman A", "Schild L", "Molenaar P", "Stroeken P",
         "van Noesel MM", "Ora I", "Santo EE", "Caron HN",
         "Westerhout EM", "Versteeg R",
     ]

     # Index 0 holds the title, index 1 the list of author names.
     title_and_authors = get_title_and_authors_for_pubmed_id("22367537")

     self.assertEqual(title_and_authors[0], expected_title)
     self.assertEqual(title_and_authors[1], expected_authors)
示例#3
0
    def _apply_metadata_to_experiment(experiment: Experiment, metadata: dict):
        """Populate `experiment` in place from a flat metadata dict.

        Only keys that are present in `metadata` are applied; missing
        keys leave the corresponding attribute untouched, except that
        `description` and `protocol_description` always end up with a
        non-empty placeholder when nothing better is available. The
        experiment is not saved here.

        Args:
            experiment: the Experiment to populate. Its `accession_code`
                must already be set, because the source URL is built
                from it.
            metadata: mapping of metadata field names to values. Values
                may be None even when the key is present.
        """
        experiment.source_url = ENA_URL_TEMPLATE.format(
            experiment.accession_code)
        experiment.source_database = "SRA"
        experiment.technology = "RNA-SEQ"

        # platform_accession_code is deliberately not set: the API
        # doesn't give us this value, unfortunately.

        if not experiment.description:
            experiment.description = "No description."

        if "study_title" in metadata:
            experiment.title = metadata["study_title"]
        if "study_abstract" in metadata:
            experiment.description = metadata["study_abstract"]
        if "lab_name" in metadata:
            experiment.submitter_institution = metadata["lab_name"]
        if "experiment_design_description" in metadata:
            experiment.protocol_description = metadata[
                "experiment_design_description"]
        if "pubmed_id" in metadata:
            experiment.pubmed_id = metadata["pubmed_id"]
            experiment.has_publication = True
        if "study_ena_first_public" in metadata:
            experiment.source_first_published = parse_date(
                metadata["study_ena_first_public"])
        if "study_ena_last_update" in metadata:
            experiment.source_last_modified = parse_date(
                metadata["study_ena_last_update"])

        # We only want GEO alternate accessions for SRA samples.
        # `or ""` rather than a .get() default: the key can be present
        # with a None value, and re.match() raises TypeError on None.
        external_id = metadata.get("external_id") or ""
        if re.match(r"^GSE\d{2,6}", external_id) is not None:
            experiment.alternate_accession_code = external_id

        # Rare, but it happens.
        if not experiment.protocol_description:
            # A .get() default doesn't work here because sometimes the
            # key is present but its value is None, in which case None
            # is returned, causing our database constraint to be
            # violated. `or` covers both the missing and the None case.
            experiment.protocol_description = (
                metadata.get("library_construction_protocol")
                or "Protocol was never provided.")

        # Scrape publication title and authorship from Pubmed
        if experiment.pubmed_id:
            pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                experiment.pubmed_id)
            experiment.publication_title = pubmed_metadata[0]
            experiment.publication_authors = pubmed_metadata[1]
示例#4
0
 def test_get_pubmed_id_title(self):
     """Check the title and author list returned for PMID 22367537."""
     pubmed_id = "22367537"
     wanted_title = 'Sequencing of neuroblastoma identifies chromothripsis and defects in neuritogenesis genes.'
     wanted_authors = [
         'Molenaar JJ',
         'Koster J',
         'Zwijnenburg DA',
         'van Sluis P',
         'Valentijn LJ',
         'van der Ploeg I',
         'Hamdi M',
         'van Nes J',
         'Westerman BA',
         'van Arkel J',
         'Ebus ME',
         'Haneveld F',
         'Lakeman A',
         'Schild L',
         'Molenaar P',
         'Stroeken P',
         'van Noesel MM',
         'Ora I',
         'Santo EE',
         'Caron HN',
         'Westerhout EM',
         'Versteeg R',
     ]

     # The helper's result is indexed: [0] is the title, [1] the authors.
     result = get_title_and_authors_for_pubmed_id(pubmed_id)

     self.assertEqual(result[0], wanted_title)
     self.assertEqual(result[1], wanted_authors)
示例#5
0
    def create_experiment_from_api(
            self, experiment_accession_code: str) -> (Experiment, Dict):
        """Given an experiment accession code, create an Experiment object.

        Also returns a dictionary of additional information about the
        platform discovered for the experiment.

        If an Experiment with this accession code already exists it is
        returned unchanged. Otherwise a new one is built from the
        experiment JSON, the IDF file and the "/protocols" endpoint, and
        saved together with two ExperimentAnnotation rows (the raw
        experiment JSON and the parsed IDF key/value pairs).

        See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/sample

        Args:
            experiment_accession_code: accession code appended to
                EXPERIMENTS_URL to fetch the experiment.

        Returns:
            A tuple `(experiment_object, platform_dict)` where
            `platform_dict` has the keys "platform_accession_code",
            "platform_accession_name" and "manufacturer". Each may be
            UNKNOWN when the platform cannot be determined yet.

        Raises:
            UnsupportedPlatformException: if the experiment lists no
                array design at all.
            KeyError, IndexError: if the response contains no experiment
                entry (logged, then re-raised).
        """
        request_url = EXPERIMENTS_URL + experiment_accession_code
        experiment_request = utils.requests_retry_session().get(request_url,
                                                                timeout=60)

        try:
            parsed_json = experiment_request.json(
            )["experiments"]["experiment"][0]
        except (KeyError, IndexError):
            # IndexError covers an "experiment" entry that is present
            # but empty; log it the same way as a missing key.
            logger.error("Remote experiment has no Experiment data!",
                         experiment_accession_code=experiment_accession_code,
                         survey_job=self.survey_job.id)
            raise

        experiment = {}
        experiment["name"] = parsed_json["name"]
        experiment["experiment_accession_code"] = experiment_accession_code

        # This experiment has no platform at all, and is therefore useless.
        if 'arraydesign' not in parsed_json or len(
                parsed_json["arraydesign"]) == 0:
            logger.warning("Remote experiment has no arraydesign listed.",
                           experiment_accession_code=experiment_accession_code,
                           survey_job=self.survey_job.id)
            raise UnsupportedPlatformException
        # If there is more than one arraydesign listed in the experiment
        # then there is no other way to determine which array was used
        # for which sample other than looking at the header of the CEL
        # file. That obviously cannot happen until the CEL file has been
        # downloaded so we can just mark it as UNKNOWN and let the
        # downloader inspect the downloaded file to determine the
        # array then.
        elif len(parsed_json["arraydesign"]
                 ) != 1 or "accession" not in parsed_json["arraydesign"][0]:
            experiment["platform_accession_code"] = UNKNOWN
            experiment["platform_accession_name"] = UNKNOWN
            experiment["manufacturer"] = UNKNOWN
        else:
            external_accession = parsed_json["arraydesign"][0]["accession"]
            for platform in get_supported_microarray_platforms():
                if platform["external_accession"] == external_accession:
                    experiment[
                        "platform_accession_code"] = get_normalized_platform(
                            platform["platform_accession"])

                    # Illumina appears in the accession codes for
                    # platforms manufactured by Illumina
                    if "ILLUMINA" in experiment[
                            "platform_accession_code"].upper():
                        experiment["manufacturer"] = "ILLUMINA"
                        experiment["platform_accession_name"] = platform[
                            "platform_accession"]
                    else:
                        # It's not Illumina, the only other supported Microarray platform is
                        # Affy. As our list of supported platforms grows this logic will
                        # need to get more sophisticated.
                        experiment["manufacturer"] = "AFFYMETRIX"
                        platform_mapping = get_readable_affymetrix_names()
                        experiment[
                            "platform_accession_name"] = platform_mapping[
                                platform["platform_accession"]]

            if "platform_accession_code" not in experiment:
                # We don't know what platform this accession corresponds to.
                experiment["platform_accession_code"] = external_accession
                experiment["platform_accession_name"] = UNKNOWN
                experiment["manufacturer"] = UNKNOWN

        experiment["release_date"] = parsed_json["releasedate"]

        # Fall back to the release date when no update date is given.
        if "lastupdatedate" in parsed_json:
            experiment["last_update_date"] = parsed_json["lastupdatedate"]
        else:
            experiment["last_update_date"] = parsed_json["releasedate"]

        # Create the experiment object
        try:
            experiment_object = Experiment.objects.get(
                accession_code=experiment_accession_code)
            logger.debug(
                "Experiment already exists, skipping object creation.",
                experiment_accession_code=experiment_accession_code,
                survey_job=self.survey_job.id)
        except Experiment.DoesNotExist:
            # We aren't sure these fields will be populated, or how many there will be.
            # Try to join them all together, or set a sensible default.
            description_items = parsed_json.get("description") or []
            experiment_description = "".join(
                description_item["text"] + "\n"
                for description_item in description_items
                if "text" in description_item)

            if experiment_description == "":
                experiment_description = "Description not available.\n"

            experiment_object = Experiment()
            experiment_object.accession_code = experiment_accession_code
            experiment_object.source_url = request_url
            experiment_object.source_database = "ARRAY_EXPRESS"
            experiment_object.title = parsed_json["name"]
            # This will need to be updated if we ever use Array
            # Express to get other kinds of data.
            experiment_object.technology = "MICROARRAY"
            experiment_object.description = experiment_description
            experiment_object.source_first_published = parse_datetime(
                experiment["release_date"])
            experiment_object.source_last_modified = parse_datetime(
                experiment["last_update_date"])
            # Saved now so the annotations below can reference it; saved
            # again once the IDF/protocol/publication fields are filled.
            experiment_object.save()

            json_xa = ExperimentAnnotation()
            json_xa.experiment = experiment_object
            json_xa.data = parsed_json
            json_xa.is_ccdl = False
            json_xa.save()

            ## Fetch and parse the IDF/SDRF file for any other fields
            IDF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.idf.txt"
            idf_url = IDF_URL_TEMPLATE.format(code=experiment_accession_code)
            idf_text = utils.requests_retry_session().get(idf_url,
                                                          timeout=60).text

            # Each IDF line is tab separated: a key followed by one or
            # more values. A single value is stored as a string, several
            # as a list; lines with no value are ignored.
            idf_dict = {}
            for line in idf_text.split('\n'):
                keyval = line.strip().split('\t')
                if len(keyval) == 2:
                    idf_dict[keyval[0]] = keyval[1]
                elif len(keyval) > 2:
                    idf_dict[keyval[0]] = keyval[1:]

            idf_xa = ExperimentAnnotation()
            idf_xa.data = idf_dict
            idf_xa.experiment = experiment_object
            idf_xa.is_ccdl = False
            idf_xa.save()

            if 'Investigation Title' in idf_dict:
                experiment_object.title = idf_dict['Investigation Title']
            if 'Person Affiliation' in idf_dict:
                # This is very rare, ex: E-MEXP-32
                if isinstance(idf_dict['Person Affiliation'], list):
                    # De-duplicate in source order so the truncated
                    # string is the same on every run.
                    unique_people = list(
                        dict.fromkeys(idf_dict['Person Affiliation']))
                    experiment_object.submitter_institution = ", ".join(
                        unique_people)[:255]
                else:
                    experiment_object.submitter_institution = idf_dict[
                        'Person Affiliation']

            # Get protocol_description from "<experiment_url>/protocols"
            # instead of from idf_dict, because the former provides more
            # details.
            protocol_url = request_url + '/protocols'
            protocol_request = utils.requests_retry_session().get(protocol_url,
                                                                  timeout=60)
            try:
                experiment_object.protocol_description = protocol_request.json(
                )['protocols']
            except KeyError:
                logger.warning(
                    "Remote experiment has no protocol data!",
                    experiment_accession_code=experiment_accession_code,
                    survey_job=self.survey_job.id)

            if 'Publication Title' in idf_dict:
                # Several titles will happen for some superseries.
                # Ex: E-GEOD-29536
                # All of them are kept, joined into a single string.
                if isinstance(idf_dict['Publication Title'], list):
                    experiment_object.publication_title = "; ".join(
                        idf_dict['Publication Title'])
                else:
                    experiment_object.publication_title = idf_dict[
                        'Publication Title']
                experiment_object.has_publication = True
            if 'Publication DOI' in idf_dict:
                if isinstance(idf_dict['Publication DOI'], list):
                    experiment_object.publication_doi = ", ".join(
                        idf_dict['Publication DOI'])
                else:
                    experiment_object.publication_doi = idf_dict[
                        'Publication DOI']
                experiment_object.has_publication = True
            if 'PubMed ID' in idf_dict:
                if isinstance(idf_dict['PubMed ID'], list):
                    experiment_object.pubmed_id = ", ".join(
                        idf_dict['PubMed ID'])
                else:
                    experiment_object.pubmed_id = idf_dict['PubMed ID']
                experiment_object.has_publication = True

            # Scrape publication title and authorship from Pubmed.
            # This overrides any title taken from the IDF above.
            if experiment_object.pubmed_id:
                pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                    experiment_object.pubmed_id)
                experiment_object.publication_title = pubmed_metadata[0]
                experiment_object.publication_authors = pubmed_metadata[1]

            experiment_object.save()

        platform_dict = {
            k: experiment[k]
            for k in ('platform_accession_code', 'platform_accession_name',
                      'manufacturer')
        }

        return experiment_object, platform_dict
示例#6
0
    def _generate_experiment_and_samples(
            self,
            run_accession: str,
            study_accession: str = None) -> (Experiment, List[Sample]):
        """Generates Experiments and Samples for the provided run_accession.

        Existing Experiment and Sample rows are reused; missing ones are
        created and saved, along with an ExperimentAnnotation, the
        OriginalFile rows for the run's download URLs, and the
        experiment/sample and experiment/organism associations.

        Args:
            run_accession: accession of the run to gather metadata for.
            study_accession: optional; only used as extra context in the
                error log when no metadata can be found.

        Returns:
            `(experiment_object, [sample_object])`, or `(None, None)`
            when no metadata or no organism name could be discovered.
        """
        metadata = SraSurveyor.gather_all_metadata(run_accession)

        if not metadata:
            log_context = {"accession": run_accession}
            if study_accession:
                log_context["study_accession"] = study_accession
            logger.error("Could not discover any metadata for run.",
                         **log_context)
            return (None, None)  # This will cascade properly

        if DOWNLOAD_SOURCE == "ENA":
            # Paired-end runs are split over two files.
            if metadata["library_layout"] == "PAIRED":
                files_urls = [
                    SraSurveyor._build_ena_file_url(run_accession, "_1"),
                    SraSurveyor._build_ena_file_url(run_accession, "_2")
                ]
            else:
                files_urls = [SraSurveyor._build_ena_file_url(run_accession)]
        else:
            files_urls = [SraSurveyor._build_ncbi_file_url(run_accession)]

        # Figure out the Organism for this sample
        organism_name = metadata.pop("organism_name", None)
        if not organism_name:
            logger.error("Could not discover organism type for run.",
                         accession=run_accession)
            return (None, None)  # This will cascade properly

        organism_name = organism_name.upper()
        organism = Organism.get_object_for_name(organism_name)

        ##
        # Experiment
        ##

        experiment_accession_code = metadata.get('study_accession')
        try:
            experiment_object = Experiment.objects.get(
                accession_code=experiment_accession_code)
            logger.debug(
                "Experiment already exists, skipping object creation.",
                experiment_accession_code=experiment_accession_code,
                survey_job=self.survey_job.id)
        except Experiment.DoesNotExist:
            experiment_object = Experiment()
            experiment_object.accession_code = experiment_accession_code
            experiment_object.source_url = ENA_URL_TEMPLATE.format(
                experiment_accession_code)
            experiment_object.source_database = "SRA"
            experiment_object.technology = "RNA-SEQ"

            # platform_accession_code is deliberately not set: the API
            # doesn't give us this value, unfortunately.

            if not experiment_object.description:
                experiment_object.description = "No description."

            if "study_title" in metadata:
                experiment_object.title = metadata["study_title"]
            if "study_abstract" in metadata:
                experiment_object.description = metadata["study_abstract"]
            if "lab_name" in metadata:
                experiment_object.submitter_institution = metadata["lab_name"]
            if "experiment_design_description" in metadata:
                experiment_object.protocol_description = metadata[
                    "experiment_design_description"]
            if "pubmed_id" in metadata:
                experiment_object.pubmed_id = metadata["pubmed_id"]
                experiment_object.has_publication = True
            if "study_ena_first_public" in metadata:
                experiment_object.source_first_published = parse_datetime(
                    metadata["study_ena_first_public"])
            if "study_ena_last_update" in metadata:
                experiment_object.source_last_modified = parse_datetime(
                    metadata["study_ena_last_update"])

            # Rare, but it happens.
            if not experiment_object.protocol_description:
                # A .get() default is not enough: the key can be present
                # with a None value, which would be stored as None. `or`
                # covers both the missing and the None/empty case.
                experiment_object.protocol_description = (
                    metadata.get("library_construction_protocol")
                    or "Protocol was never provided.")
            # Scrape publication title and authorship from Pubmed
            if experiment_object.pubmed_id:
                pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                    experiment_object.pubmed_id)
                experiment_object.publication_title = pubmed_metadata[0]
                experiment_object.publication_authors = pubmed_metadata[1]

            experiment_object.save()

            ##
            # Experiment Metadata
            ##
            json_xa = ExperimentAnnotation()
            json_xa.experiment = experiment_object
            json_xa.data = metadata
            json_xa.is_ccdl = False
            json_xa.save()

        ##
        # Samples
        ##

        sample_accession_code = metadata.pop('run_accession')
        # Create the sample object
        try:
            sample_object = Sample.objects.get(
                accession_code=sample_accession_code)
            # If current experiment includes new protocol information,
            # merge it into the sample's existing protocol_info.
            protocol_info, is_updated = self.update_sample_protocol_info(
                sample_object.protocol_info,
                experiment_object.protocol_description,
                experiment_object.source_url)
            if is_updated:
                sample_object.protocol_info = protocol_info
                sample_object.save()

            logger.debug(
                "Sample %s already exists, skipping object creation.",
                sample_accession_code,
                experiment_accession_code=experiment_object.accession_code,
                survey_job=self.survey_job.id)
        except Sample.DoesNotExist:
            sample_object = Sample()
            sample_object.source_database = "SRA"
            sample_object.accession_code = sample_accession_code
            sample_object.organism = organism

            sample_object.platform_name = metadata.get(
                "platform_instrument_model", "UNKNOWN")
            # The platform_name is human readable and contains spaces,
            # accession codes shouldn't have spaces though:
            sample_object.platform_accession_code = sample_object.platform_name.replace(
                " ", "")
            sample_object.technology = "RNA-SEQ"

            # Derive the manufacturer from the instrument model name.
            platform_upper = sample_object.platform_name.upper()
            if "ILLUMINA" in platform_upper or "NEXTSEQ" in platform_upper:
                sample_object.manufacturer = "ILLUMINA"
            elif "ION TORRENT" in platform_upper:
                sample_object.manufacturer = "ION_TORRENT"
            else:
                sample_object.manufacturer = "UNKNOWN"

            # Directly apply the harmonized values
            sample_object.title = harmony.extract_title(metadata)
            harmonized_sample = harmony.harmonize([metadata])
            for key, value in harmonized_sample.items():
                setattr(sample_object, key, value)

            protocol_info, is_updated = self.update_sample_protocol_info(
                existing_protocols=[],
                experiment_protocol=experiment_object.protocol_description,
                experiment_url=experiment_object.source_url)
            # Do not check is_updated the first time because we must
            # save a list so we can append to it later.
            sample_object.protocol_info = protocol_info

            sample_object.save()

            for file_url in files_urls:
                original_file = OriginalFile.objects.get_or_create(
                    source_url=file_url,
                    source_filename=file_url.split('/')[-1],
                    has_raw=True)[0]
                OriginalFileSampleAssociation.objects.get_or_create(
                    original_file=original_file, sample=sample_object)

        # Create associations if they don't already exist
        ExperimentSampleAssociation.objects.get_or_create(
            experiment=experiment_object, sample=sample_object)

        ExperimentOrganismAssociation.objects.get_or_create(
            experiment=experiment_object, organism=organism)

        return experiment_object, [sample_object]
示例#7
0
    def create_experiment_and_samples_from_api(
            self, experiment_accession_code) -> (Experiment, List[Sample]):
        """ The main surveyor - find the Experiment and Samples from NCBI GEO.

        Uses the GEOParse library, for which docs can be found here: https://geoparse.readthedocs.io/en/latest/usage.html#working-with-geo-objects

        """
        # Cleaning up is tracked here: https://github.com/guma44/GEOparse/issues/41
        gse = GEOparse.get_GEO(experiment_accession_code,
                               destdir=self.get_temp_path(),
                               how="brief",
                               silent=True)
        preprocessed_samples = harmony.preprocess_geo(gse.gsms.items())
        harmonized_samples = harmony.harmonize(preprocessed_samples)

        # Create the experiment object
        try:
            experiment_object = Experiment.objects.get(
                accession_code=experiment_accession_code)
            logger.debug(
                "Experiment %s already exists, skipping object creation.",
                experiment_accession_code,
                survey_job=self.survey_job.id)
        except Experiment.DoesNotExist:
            experiment_object = Experiment()
            experiment_object.accession_code = experiment_accession_code
            experiment_object.source_url = (
                "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=" +
                experiment_accession_code)
            experiment_object.source_database = "GEO"
            experiment_object.title = gse.metadata.get('title', [''])[0]
            experiment_object.description = gse.metadata.get('summary',
                                                             [''])[0]

            # Source doesn't provide time information, assume midnight.
            submission_date = gse.metadata["submission_date"][
                0] + " 00:00:00 UTC"
            experiment_object.source_first_published = dateutil.parser.parse(
                submission_date)
            last_updated_date = gse.metadata["last_update_date"][
                0] + " 00:00:00 UTC"
            experiment_object.source_last_updated = dateutil.parser.parse(
                last_updated_date)

            unique_institutions = list(set(gse.metadata["contact_institute"]))
            experiment_object.submitter_institution = ", ".join(
                unique_institutions)
            experiment_object.pubmed_id = gse.metadata.get("pubmed_id",
                                                           [""])[0]

            # Scrape publication title and authorship from Pubmed
            if experiment_object.pubmed_id:
                pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                    experiment_object.pubmed_id)
                experiment_object.publication_title = pubmed_metadata[0]
                experiment_object.publication_authors = pubmed_metadata[1]

            experiment_object.save()

            experiment_annotation = ExperimentAnnotation()
            experiment_annotation.data = gse.metadata
            experiment_annotation.experiment = experiment_object
            experiment_annotation.is_ccdl = False
            experiment_annotation.save()

        # Okay, here's the situation!
        # Sometimes, samples have a direct single representation for themselves.
        # Othertimes, there is a single file with references to every sample in it.
        created_samples = []
        for sample_accession_code, sample in gse.gsms.items():

            try:
                sample_object = Sample.objects.get(
                    accession_code=sample_accession_code)
                logger.debug(
                    "Sample %s from experiment %s already exists, skipping object creation.",
                    sample_accession_code,
                    experiment_object.accession_code,
                    survey_job=self.survey_job.id)

                # Associate it with the experiment, but since it
                # already exists it already has original files
                # associated with it and it's already been downloaded,
                # so don't add it to created_samples.
                ExperimentSampleAssociation.objects.get_or_create(
                    experiment=experiment_object, sample=sample_object)

                ExperimentOrganismAssociation.objects.get_or_create(
                    experiment=experiment_object,
                    organism=sample_object.organism)
            except Sample.DoesNotExist:
                organism = Organism.get_object_for_name(
                    sample.metadata['organism_ch1'][0].upper())

                sample_object = Sample()
                sample_object.source_database = "GEO"
                sample_object.accession_code = sample_accession_code
                sample_object.organism = organism

                # If data processing step, it isn't raw.
                sample_object.has_raw = not sample.metadata.get(
                    'data_processing', None)

                ExperimentOrganismAssociation.objects.get_or_create(
                    experiment=experiment_object, organism=organism)
                sample_object.title = sample.metadata['title'][0]

                self.set_platform_properties(sample_object, sample.metadata,
                                             gse)

                # Directly assign the harmonized properties
                harmonized_sample = harmonized_samples[sample_object.title]
                for key, value in harmonized_sample.items():
                    setattr(sample_object, key, value)

                # Sample-level protocol_info
                sample_object.protocol_info = self.get_sample_protocol_info(
                    sample.metadata, sample_accession_code)

                sample_object.save()
                logger.debug("Created Sample: " + str(sample_object))

                sample_annotation = SampleAnnotation()
                sample_annotation.sample = sample_object
                sample_annotation.data = sample.metadata
                sample_annotation.is_ccdl = False
                sample_annotation.save()

                sample_supplements = sample.metadata.get(
                    'supplementary_file', [])
                for supplementary_file_url in sample_supplements:

                    # Why do they give us this?
                    if supplementary_file_url == "NONE":
                        break

                    # We never want these!
                    if "idat.gz" in supplementary_file_url.lower():
                        continue
                    if "chp.gz" in supplementary_file_url.lower():
                        continue
                    if "ndf.gz" in supplementary_file_url.lower():
                        continue
                    if "pos.gz" in supplementary_file_url.lower():
                        continue
                    if "pair.gz" in supplementary_file_url.lower():
                        continue
                    if "gff.gz" in supplementary_file_url.lower():
                        continue

                    # Sometimes, we are lied to about the data processing step.
                    lower_file_url = supplementary_file_url.lower()
                    if '.cel' in lower_file_url \
                    or ('_non_normalized.txt' in lower_file_url) \
                    or ('_non-normalized.txt' in lower_file_url) \
                    or ('-non-normalized.txt' in lower_file_url) \
                    or ('-non_normalized.txt' in lower_file_url):
                        sample_object.has_raw = True
                        sample_object.save()

                    # filename and source_filename are the same for these
                    filename = supplementary_file_url.split('/')[-1]
                    original_file = OriginalFile.objects.get_or_create(
                        source_url=supplementary_file_url,
                        filename=filename,
                        source_filename=filename,
                        has_raw=sample_object.has_raw,
                        is_archive=True)[0]

                    logger.debug("Created OriginalFile: " + str(original_file))

                    original_file_sample_association = OriginalFileSampleAssociation.objects.get_or_create(
                        original_file=original_file, sample=sample_object)

                    if original_file.is_affy_data():
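                        # Only Affymetrix Microarrays produce .CEL files
                        # NOTE(review): 'AFFYMETRTIX' below looks like a
                        # misspelling of 'AFFYMETRIX' -- confirm against the
                        # values used elsewhere before changing it.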
                        sample_object.technology = 'MICROARRAY'
                        sample_object.manufacturer = 'AFFYMETRTIX'
                        sample_object.save()

                # It's okay to survey RNA-Seq samples from GEO, but we
                # don't actually want to download/process any RNA-Seq
                # data unless it comes from SRA.
                if sample_object.technology != 'RNA-SEQ':
                    created_samples.append(sample_object)

                # Now that we've determined the technology at the
                # sample level, we can set it at the experiment level.
                # We only set it if the experiment doesn't already have
                # one, so it is effectively done once. An experiment can
                # have more than one technology; supporting that should
                # be handled as part of:
                # https://github.com/AlexsLemonade/refinebio/issues/1099
                if not experiment_object.technology:
                    experiment_object.technology = sample_object.technology
                    experiment_object.save()

                ExperimentSampleAssociation.objects.get_or_create(
                    experiment=experiment_object, sample=sample_object)

        # These supplementary files _may-or-may-not_ contain the type of raw data we can process.
        for experiment_supplement_url in gse.metadata.get(
                'supplementary_file', []):

            # filename and source_filename are the same for these
            filename = experiment_supplement_url.split('/')[-1]
            original_file = OriginalFile.objects.get_or_create(
                source_url=experiment_supplement_url,
                filename=filename,
                source_filename=filename,
                has_raw=sample_object.has_raw,
                is_archive=True)[0]

            logger.debug("Created OriginalFile: " + str(original_file))

            lower_supplement_url = experiment_supplement_url.lower()
            if ('_non_normalized.txt' in lower_supplement_url) \
            or ('_non-normalized.txt' in lower_supplement_url) \
            or ('-non-normalized.txt' in lower_supplement_url) \
            or ('-non_normalized.txt' in lower_supplement_url):
                for sample_object in created_samples:
                    sample_object.has_raw = True
                    sample_object.save()

                    OriginalFileSampleAssociation.objects.get_or_create(
                        sample=sample_object, original_file=original_file)

            # Delete this Original file if it isn't being used.
            if OriginalFileSampleAssociation.objects.filter(
                    original_file=original_file).count() == 0:
                original_file.delete()

        # These are the Miniml/Soft/Matrix URLs that are always(?) provided.
        # GEO describes different types of data formatting as "families"
        family_url = self.get_miniml_url(experiment_accession_code)
        miniml_original_file = OriginalFile.objects.get_or_create(
            source_url=family_url,
            source_filename=family_url.split('/')[-1],
            has_raw=sample_object.has_raw,
            is_archive=True)[0]
        for sample_object in created_samples:
            # We don't need a .txt if we have a .CEL
            if sample_object.has_raw:
                continue
            OriginalFileSampleAssociation.objects.get_or_create(
                sample=sample_object, original_file=miniml_original_file)

        # Delete this Original file if it isn't being used.
        if OriginalFileSampleAssociation.objects.filter(
                original_file=miniml_original_file).count() == 0:
            miniml_original_file.delete()

        # Trash the temp path
        try:
            shutil.rmtree(self.get_temp_path())
        except Exception:
            # There was a problem during surveying so this didn't get created.
            # It's not a big deal.
            pass

        return experiment_object, created_samples