Example #1
0
    def test_supported_microarray_platforms(self):
        """Test that supported microarray platforms setting is set correctly."""
        platforms = utils.get_supported_microarray_platforms()

        def contains(key, accession, is_brainarray):
            # True when some supported platform matches `accession` under
            # `key` and has the requested brainarray flag.
            return any(
                entry[key] == accession
                and bool(entry["is_brainarray"]) == is_brainarray
                for entry in platforms
            )

        self.assertTrue(contains("platform_accession", "equgene11st", True))
        self.assertTrue(contains("external_accession", "A-AFFY-59", False))
        self.assertTrue(contains("external_accession", "GPL23026", False))
        self.assertTrue(contains("external_accession", "A-GEOD-23026", False))
Example #2
0
def _is_platform_supported(platform: str) -> bool:
    """Determines if platform is a platform_accession we support or not.

    It does so by trying to correct for common string issues such as
    case and spacing and then comparing against our configuration
    files which specify which platforms are supported.
    """
    upper_platform = platform.upper()

    # Check if this is a supported Microarray platform. Both the internal
    # and the external accession are compared case-insensitively.
    for supported_platform in utils.get_supported_microarray_platforms():
        if (supported_platform["platform_accession"].upper() == upper_platform
                or supported_platform["external_accession"].upper()
                == upper_platform):
            return True

    # Check if this is a supported RNASeq platform.
    # GEO RNASeq platform titles often have organisms appended to
    # an otherwise recognizable platform. The list of supported
    # RNASeq platforms isn't long, so see if any of them are
    # contained within what GEO gave us.
    # Example: GSE69572 has a platform title of:
    # 'Illumina Genome Analyzer IIx (Glycine max)'
    # Which should match 'Illumina Genome Analyzer IIx'
    # because RNASeq platforms are organism agnostic.
    # Spacing can be inconsistent, easiest to just remove it entirely.
    # (Hoisted out of the loop so it is only computed once.)
    squashed_platform = upper_platform.replace(" ", "")
    for supported_platform in utils.get_supported_rnaseq_platforms():
        if supported_platform.upper().replace(" ", "") in squashed_platform:
            return True

    return False
Example #3
0
 def get_queryset(self):
     """Exclude experiments that only contain unsupported platforms
     from our ES instance."""
     microarray_accessions = [
         platform['platform_accession']
         for platform in get_supported_microarray_platforms()
     ]
     rnaseq_accessions = [
         name.replace(' ', '') for name in get_supported_rnaseq_platforms()
     ]
     # https://www.postgresql.org/docs/9.1/functions-array.html
     base_queryset = super(ExperimentDocument, self).get_queryset()
     return base_queryset.filter(
         platform_accession_codes__contained_by=(
             microarray_accessions + rnaseq_accessions
         )
     )
    def handle(self, *args, **options):
        """Requeues downloader jobs for samples that haven't been processed
        and whose original files have no downloader jobs associated with them.

        Accepts an optional `created_after` option to restrict the fix-up to
        recently created samples.
        """
        supported_microarray_platforms = [
            x["platform_accession"] for x in get_supported_microarray_platforms()
        ]
        supported_rnaseq_platforms = [x.replace(" ", "") for x in get_supported_rnaseq_platforms()]
        all_supported_platforms = (
            supported_microarray_platforms + supported_rnaseq_platforms
        )

        # Select unprocessed samples on a supported platform that have at
        # least one original file but zero downloader jobs across them.
        samples_without_downloader = (
            Sample.objects.all()
            .filter(platform_accession_code__in=all_supported_platforms)
            .annotate(
                original_files_count=Count("original_files"),
                downloader_job_count=Count("original_files__downloader_jobs"),
            )
            .filter(is_processed=False, original_files_count__gt=0, downloader_job_count=0)
        )
        if options.get("created_after", None):
            samples_without_downloader = samples_without_downloader.filter(
                created_at__gt=options["created_after"]
            )

        samples_without_downloader = samples_without_downloader.prefetch_related("original_files")

        logger.info(
            "Found %d samples without downloader jobs, starting to create them now.",
            samples_without_downloader.count(),
        )

        # Page through the queryset so we never hold every sample in memory.
        paginator = Paginator(samples_without_downloader, PAGE_SIZE)
        page = paginator.page()

        while True:
            for sample in page.object_list:
                logger.debug("Creating downloader job for a sample.", sample=sample.accession_code)
                create_downloader_job(sample.original_files.all())

            # Log the actual number of jobs created for this page; the final
            # page may contain fewer than PAGE_SIZE samples.
            logger.info(
                "Created %d new downloader jobs because their samples didn't have any.",
                len(page.object_list),
            )

            if not page.has_next():
                break

            page = paginator.page(page.next_page_number())
Example #5
0
File: geo.py  Project: erflynn/refinebio
    def set_platform_properties(self, sample_object: Sample,
                                sample_metadata: Dict,
                                gse: GEOparse.GSM) -> Sample:
        """Sets platform-related properties on `sample_object`.

        Uses metadata from `gse` to populate platform_name,
        platform_accession_code, and technology on `sample_object`.

        Returns the (mutated) `sample_object`; may hit the network via
        GEOparse to fetch the platform (GPL) record.

        NOTE(review): `sample_metadata` is never read in this method --
        confirm whether it is required by the interface.
        """

        # Determine platform information
        external_accession = get_normalized_platform(
            gse.metadata.get("platform_id", [UNKNOWN])[0])

        if external_accession == UNKNOWN:
            sample_object.platform_accession_code = UNKNOWN
            sample_object.platform_name = UNKNOWN
            sample_object.manufacturer = UNKNOWN
            # If this sample is Affy, we potentially can extract the
            # platform information from the .CEL file. If it's not we
            # can't do anything. Therefore assume the technology is
            # microarray when we have no platform information.
            sample_object.technology = "MICROARRAY"
            return sample_object

        platform_accession_code = UNKNOWN

        # Fetch the platform record from GEO so we can inspect its title;
        # "brief" keeps the download small.
        gpl = GEOparse.get_GEO(external_accession,
                               destdir=self.get_temp_path(),
                               how="brief",
                               silent=True)
        platform_title = gpl.metadata.get("title", [UNKNOWN])[0]

        # Check if this is a supported microarray platform.
        # NOTE(review): no `break` here -- if several entries share the same
        # external accession the last match wins.
        for platform in get_supported_microarray_platforms():
            if platform["external_accession"] == external_accession:
                platform_accession_code = platform["platform_accession"]

        if platform_accession_code != UNKNOWN:
            # It's a supported microarray platform.

            # We are using the brain array package as the platform accession code,
            # so, for instance, GPL3213 becomes 'chicken'.
            sample_object.platform_accession_code = platform_accession_code
            sample_object.technology = "MICROARRAY"
            try:

                # Related: https://github.com/AlexsLemonade/refinebio/issues/354
                # If it's Affy we can get a readable name:
                sample_object.platform_name = get_readable_affymetrix_names(
                )[platform_accession_code]
                sample_object.manufacturer = "AFFYMETRIX"

                # Sometimes Affymetrix samples have weird channel
                # protocol metadata, so if we find that it's
                # Affymetrix return it now. Example: GSE113945
                return sample_object
            except KeyError:
                # Otherwise we'll use what we've got.
                sample_object.platform_name = platform_title

            # Determine manufacturer

            platform = sample_object.pretty_platform.upper()
            if "AGILENT" in platform:
                sample_object.manufacturer = "AGILENT"
            elif "ILLUMINA" in platform or "NEXTSEQ" in platform:
                sample_object.manufacturer = "ILLUMINA"
            elif "AFFYMETRIX" in platform:
                sample_object.manufacturer = "AFFYMETRIX"
            else:
                sample_object.manufacturer = UNKNOWN

            return sample_object

        # Check to see if this is a supported RNASeq technology:

        # GEO RNASeq platform titles often have organisms appended to
        # an otherwise recognizable platform. The list of supported
        # RNASeq platforms isn't long, so see if any of them are
        # contained within what GEO gave us.
        # Example: GSE69572 has a platform title of:
        # 'Illumina Genome Analyzer IIx (Glycine max)'
        # Which should really just be 'Illumina Genome Analyzer IIx'
        # because RNASeq platforms are organism agnostic.  However,
        # the platforms 'Illumina Genome Analyzer' and 'Illumina
        # Genome Analyzer II' would also be matched, so make sure that
        # the longest platform names are tested first:
        sorted_platform_list = get_supported_rnaseq_platforms().copy()
        sorted_platform_list.sort(key=len, reverse=True)

        for platform in sorted_platform_list:
            if platform.upper() in platform_title.upper():
                sample_object.technology = "RNA-SEQ"
                sample_object.platform_name = platform
                # We just use RNASeq platform titles as accessions
                sample_object.platform_accession_code = platform

                # NOTE(review): a name like "Illumina NextSeq 500" matches
                # the ILLUMINA branch first, so NEXTSEQ only fires for names
                # without "ILLUMINA" in them -- confirm this is intended.
                if "ILLUMINA" in sample_object.platform_name.upper():
                    sample_object.manufacturer = "ILLUMINA"
                elif "NEXTSEQ" in sample_object.platform_name.upper():
                    sample_object.manufacturer = "NEXTSEQ"
                elif "ION TORRENT" in sample_object.platform_name.upper():
                    sample_object.manufacturer = "ION_TORRENT"
                else:
                    sample_object.manufacturer = UNKNOWN

                return sample_object

        # If we've made it this far, we don't know what this platform
        # is, therefore we can't know what its technology is. What we
        # do know is what GEO said was it's platform's accession and
        # title are, and that it's unsupported.
        sample_object.platform_name = platform_title
        sample_object.platform_accession_code = external_accession
        sample_object.technology = UNKNOWN
        sample_object.manufacturer = UNKNOWN

        return sample_object
Example #6
0
    def create_experiment_from_api(
            self, experiment_accession_code: str) -> (Experiment, Dict):
        """Given an experiment accession code, create an Experiment object.

        Also returns a dictionary of additional information about the
        platform discovered for the experiment.

        Will raise an UnsupportedPlatformException if this experiment was
        conducted using a platform which we don't support.

        See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/sample
        """
        request_url = EXPERIMENTS_URL + experiment_accession_code
        experiment_request = utils.requests_retry_session().get(request_url,
                                                                timeout=60)

        try:
            parsed_json = experiment_request.json(
            )["experiments"]["experiment"][0]
        except KeyError:
            logger.error("Remote experiment has no Experiment data!",
                         experiment_accession_code=experiment_accession_code,
                         survey_job=self.survey_job.id)
            raise

        # Accumulates platform/release metadata; also the source for the
        # `platform_dict` returned at the end.
        experiment = {}
        experiment["name"] = parsed_json["name"]
        experiment["experiment_accession_code"] = experiment_accession_code

        # This experiment has no platform at all, and is therefore useless.
        if 'arraydesign' not in parsed_json or len(
                parsed_json["arraydesign"]) == 0:
            logger.warn("Remote experiment has no arraydesign listed.",
                        experiment_accession_code=experiment_accession_code,
                        survey_job=self.survey_job.id)
            raise UnsupportedPlatformException
        # If there is more than one arraydesign listed in the experiment
        # then there is no other way to determine which array was used
        # for which sample other than looking at the header of the CEL
        # file. That obviously cannot happen until the CEL file has been
        # downloaded so we can just mark it as UNKNOWN and let the
        # downloader inspect the downloaded file to determine the
        # array then.
        elif len(parsed_json["arraydesign"]
                 ) != 1 or "accession" not in parsed_json["arraydesign"][0]:
            experiment["platform_accession_code"] = UNKNOWN
            experiment["platform_accession_name"] = UNKNOWN
            experiment["manufacturer"] = UNKNOWN
        else:
            external_accession = parsed_json["arraydesign"][0]["accession"]
            # NOTE(review): no `break` -- if several supported platforms
            # share this external accession the last match wins.
            for platform in get_supported_microarray_platforms():
                if platform["external_accession"] == external_accession:
                    experiment[
                        "platform_accession_code"] = get_normalized_platform(
                            platform["platform_accession"])

                    # Illumina appears in the accession codes for
                    # platforms manufactured by Illumina
                    if "ILLUMINA" in experiment[
                            "platform_accession_code"].upper():
                        experiment["manufacturer"] = "ILLUMINA"
                        experiment["platform_accession_name"] = platform[
                            "platform_accession"]
                    else:
                        # It's not Illumina, the only other supported Microarray platform is
                        # Affy. As our list of supported platforms grows this logic will
                        # need to get more sophisticated.
                        experiment["manufacturer"] = "AFFYMETRIX"
                        platform_mapping = get_readable_affymetrix_names()
                        experiment[
                            "platform_accession_name"] = platform_mapping[
                                platform["platform_accession"]]

            if "platform_accession_code" not in experiment:
                # We don't know what platform this accession corresponds to.
                experiment["platform_accession_code"] = external_accession
                experiment["platform_accession_name"] = UNKNOWN
                experiment["manufacturer"] = UNKNOWN

        experiment["release_date"] = parsed_json["releasedate"]

        if "lastupdatedate" in parsed_json:
            experiment["last_update_date"] = parsed_json["lastupdatedate"]
        else:
            experiment["last_update_date"] = parsed_json["releasedate"]

        # Create the experiment object
        try:
            experiment_object = Experiment.objects.get(
                accession_code=experiment_accession_code)
            logger.debug(
                "Experiment already exists, skipping object creation.",
                experiment_accession_code=experiment_accession_code,
                survey_job=self.survey_job.id)
        except Experiment.DoesNotExist:
            # We aren't sure these fields will be populated, or how many there will be.
            # Try to join them all together, or set a sensible default.
            # NOTE(review): local name "experiment_descripton" is misspelled
            # (harmless, local only) -- worth renaming in a follow-up.
            experiment_descripton = ""
            if "description" in parsed_json and len(
                    parsed_json["description"]) > 0:
                for description_item in parsed_json["description"]:
                    if "text" in description_item:
                        experiment_descripton = experiment_descripton + description_item[
                            "text"] + "\n"

            if experiment_descripton == "":
                experiment_descripton = "Description not available.\n"

            experiment_object = Experiment()
            experiment_object.accession_code = experiment_accession_code
            experiment_object.source_url = request_url
            experiment_object.source_database = "ARRAY_EXPRESS"
            experiment_object.title = parsed_json["name"]
            # This will need to be updated if we ever use Array
            # Express to get other kinds of data.
            experiment_object.technology = "MICROARRAY"
            experiment_object.description = experiment_descripton
            experiment_object.source_first_published = parse_datetime(
                experiment["release_date"])
            experiment_object.source_last_modified = parse_datetime(
                experiment["last_update_date"])
            experiment_object.save()

            json_xa = ExperimentAnnotation()
            json_xa.experiment = experiment_object
            json_xa.data = parsed_json
            json_xa.is_ccdl = False
            json_xa.save()

            ## Fetch and parse the IDF/SDRF file for any other fields
            IDF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.idf.txt"
            idf_url = IDF_URL_TEMPLATE.format(code=experiment_accession_code)
            idf_text = utils.requests_retry_session().get(idf_url,
                                                          timeout=60).text

            # The IDF file is tab-separated: first cell is the key, the
            # rest are values (a list when there is more than one value).
            lines = idf_text.split('\n')
            idf_dict = {}
            for line in lines:
                keyval = line.strip().split('\t')
                if len(keyval) == 2:
                    idf_dict[keyval[0]] = keyval[1]
                elif len(keyval) > 2:
                    idf_dict[keyval[0]] = keyval[1:]

            idf_xa = ExperimentAnnotation()
            idf_xa.data = idf_dict
            idf_xa.experiment = experiment_object
            idf_xa.is_ccdl = False
            idf_xa.save()

            if 'Investigation Title' in idf_dict:
                experiment_object.title = idf_dict['Investigation Title']
            if 'Person Affiliation' in idf_dict:
                # This is very rare, ex: E-MEXP-32
                if isinstance(idf_dict['Person Affiliation'], list):

                    unique_people = list(set(idf_dict['Person Affiliation']))
                    experiment_object.submitter_institution = ", ".join(
                        unique_people)[:255]
                else:
                    experiment_object.submitter_institution = idf_dict[
                        'Person Affiliation']

            # Get protocol_description from "<experiment_url>/protocols"
            # instead of from idf_dict, because the former provides more
            # details.
            protocol_url = request_url + '/protocols'
            protocol_request = utils.requests_retry_session().get(protocol_url,
                                                                  timeout=60)
            try:
                experiment_object.protocol_description = protocol_request.json(
                )['protocols']
            except KeyError:
                logger.warning(
                    "Remote experiment has no protocol data!",
                    experiment_accession_code=experiment_accession_code,
                    survey_job=self.survey_job.id)

            if 'Publication Title' in idf_dict:
                # This will happen for some superseries.
                # Ex: E-GEOD-29536
                # Assume most recent is "best:, store the rest in experiment annotation.
                if isinstance(idf_dict['Publication Title'], list):
                    experiment_object.publication_title = "; ".join(
                        idf_dict['Publication Title'])
                else:
                    experiment_object.publication_title = idf_dict[
                        'Publication Title']
                experiment_object.has_publication = True
            if 'Publication DOI' in idf_dict:
                if isinstance(idf_dict['Publication DOI'], list):
                    experiment_object.publication_doi = ", ".join(
                        idf_dict['Publication DOI'])
                else:
                    experiment_object.publication_doi = idf_dict[
                        'Publication DOI']
                experiment_object.has_publication = True
            if 'PubMed ID' in idf_dict:
                if isinstance(idf_dict['PubMed ID'], list):
                    experiment_object.pubmed_id = ", ".join(
                        idf_dict['PubMed ID'])
                else:
                    experiment_object.pubmed_id = idf_dict['PubMed ID']
                experiment_object.has_publication = True

            # Scrape publication title and authorship from Pubmed
            if experiment_object.pubmed_id:
                pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                    experiment_object.pubmed_id)
                experiment_object.publication_title = pubmed_metadata[0]
                experiment_object.publication_authors = pubmed_metadata[1]

            experiment_object.save()

        platform_dict = {}
        for k in ('platform_accession_code', 'platform_accession_name',
                  'manufacturer'):
            platform_dict[k] = experiment[k]

        return experiment_object, platform_dict
Example #7
0
def download_array_express(job_id: int) -> None:
    """The main function for the Array Express Downloader.

    Downloads a single zip file containing the .PCL files representing
    samples relating to a single experiment stored in
    ArrayExpress.
    """
    job = utils.start_job(job_id)
    success = True

    file_assocs = DownloaderJobOriginalFileAssociation.objects.filter(downloader_job=job)
    # AE will have multiple files per DownloaderJob, but they are all
    # pieces of the same zip file so they're all referencing the same
    # URL.
    original_file = file_assocs[0].original_file
    url = original_file.source_url
    accession_code = job.accession_code

    # First, get all the unique sample archive URLs.
    # There may be more than one!
    # Then, unpack all the ones downloaded.
    # Then create processor jobs!

    og_files = []
    # The files for all of the samples are
    # contained within the same zip file. Therefore only
    # download the one.
    os.makedirs(LOCAL_ROOT_DIR + '/' + accession_code, exist_ok=True)

    # Add a timestamp in milliseconds to filename to prevent multiple jobs from using the same file.
    filename = url.split('/')[-1] + "." + str(int(time.time() * 1000))
    dl_file_path = LOCAL_ROOT_DIR + '/' + accession_code + '/' + filename + ".zip"
    _download_file(url, dl_file_path, job)

    extracted_files = _extract_files(dl_file_path, accession_code, job)

    for og_file in extracted_files:
        try:
            original_file = OriginalFile.objects.get(
                source_filename=og_file['filename'], source_url=original_file.source_url)
            original_file.is_downloaded = True
            original_file.is_archive = False
            original_file.absolute_file_path = og_file['absolute_path']
            original_file.filename = og_file['absolute_path'].split('/')[-1]
            original_file.calculate_size()
            original_file.save()
            original_file.calculate_sha1()
            og_files.append(original_file)
        except Exception:
            # The suspicion is that there are extra files related to
            # another experiment, that we don't want associated with
            # this one.
            logger.debug("Found a file we didn't have an OriginalFile for! Why did this happen?: "
                        + og_file['filename'],
                        downloader_job=job_id)
            os.remove(og_file["absolute_path"])
            continue

        sample_objects = Sample.objects.filter(originalfile=original_file).order_by('created_at')
        if sample_objects.count() > 1:
            logger.warn("Found an Array Express OriginalFile with more than one sample: %s",
                        filename,
                        downloader_job=job_id)

        # If the file is a .CEL file, it is the ultimate
        # source of truth about the sample's platform.
        sample_object = sample_objects[0]
        if og_file["filename"].upper()[-4:] == ".CEL" and sample_object.has_raw:
            cel_file_platform = None
            platform_accession_code = "UNSUPPORTED"
            try:
                cel_file_platform = microarray.get_platform_from_CEL(
                    original_file.absolute_file_path)

                for platform in get_supported_microarray_platforms():
                    if platform["platform_accession"] == cel_file_platform:
                        platform_accession_code = platform["platform_accession"]
            except Exception:
                platform_accession_code = "UNDETERMINABLE"
                logger.warn("Unable to determine platform from CEL file: "
                            + original_file.absolute_file_path,
                            downloader_job=job_id)
            if platform_accession_code == "UNSUPPORTED":
                logger.error("Found a raw .CEL file with an unsupported platform!",
                             file_name=original_file.absolute_file_path,
                             sample=sample_object.id,
                             downloader_job=job_id,
                             cel_file_platform=cel_file_platform)
                job.failure_reason = ("Found a raw .CEL file with an unsupported platform: "
                                      + original_file.absolute_file_path + " ("
                                      + str(cel_file_platform) + ")")
                job.no_retry = True
                success = False

                # The file is unsupported, delete it!
                original_file.delete_local_file()
                original_file.delete()
            elif platform_accession_code == "UNDETERMINABLE":
                # If we cannot determine the platform from the
                # .CEL file, the platform discovered via metadata
                # may be correct so just leave it be.
                pass
            else:
                # We determined the file was collected with a supported Affymetrix platform.
                sample_object.platform_accession_code = platform_accession_code
                sample_object.platform_name = get_readable_affymetrix_names()[
                    platform_accession_code]

            # However, if the filename contains '.CEL' we know
            # it's an Affymetrix Microarray
            sample_object.technology = "MICROARRAY"
            # Fixed: this attribute was previously misspelled
            # ("manufacterer"), so the manufacturer was never saved.
            sample_object.manufacturer = "AFFYMETRIX"
            sample_object.save()

    if success:
        logger.debug("File downloaded and extracted successfully.",
                     url=url,
                     downloader_job=job_id)

        utils.create_processor_jobs_for_original_files(og_files, job)

    utils.end_downloader_job(job, success)