def make_sample_platform_names_readable(apps, schema_editor):
    '''Restore human readable platform_names on the Sample model.

    We were removing spaces from rnaSeq samples' platforms, which made
    them unfriendly to users. This reverts each platform name to its
    original spelling as found in the list of supported rnaSeq
    platforms.

    We do this by munging (removing spaces, upper-casing) the platform
    name on both the Sample object and the list of rnaSeq platforms so
    that they are comparable, then if they match we set the
    Sample.platform_name to what the list had originally.

    We can't import the Sample model directly as it may be a newer
    version than this migration expects. We use the historical version.

    Args:
        apps: used to look up the historical Sample model.
        schema_editor: not used by this function.
    '''
    Sample = apps.get_model('data_refinery_common', 'Sample')

    # Map each munged platform name to the readable name it came from.
    platform_mapping = {
        platform.replace(' ', '').upper(): platform
        for platform in get_supported_rnaseq_platforms()
    }

    # Paginate over an explicitly ordered queryset: rows are saved while
    # we walk the pages, and without a stable ordering page boundaries
    # are not guaranteed to stay put between queries.
    paginator = Paginator(Sample.objects.all().order_by('pk'), 200)

    # Pages are numbered from 1 to num_pages inclusive. The upper bound
    # must be num_pages + 1, otherwise the last page is never processed.
    for page_idx in range(1, paginator.num_pages + 1):
        for sample in paginator.page(page_idx).object_list:
            munged_sample_platform = sample.platform_name.replace(' ',
                                                                  '').upper()
            if munged_sample_platform in platform_mapping:
                sample.platform_name = platform_mapping[munged_sample_platform]
                sample.save()
        print("Updating page " + str(page_idx))
예제 #2
0
def _is_platform_supported(platform: str) -> bool:
    """Tell whether `platform` matches one of the platforms we support.

    The comparison is case-insensitive. Microarray platforms must match
    either their platform accession or their external accession
    exactly; RNASeq platforms match when the supported name, with
    spaces removed, is contained in `platform` with spaces removed.
    """
    normalized = platform.upper()

    # Microarray: exact match on either accession.
    for microarray in utils.get_supported_microarray_platforms():
        if microarray["platform_accession"].upper() == normalized:
            return True
        if microarray["external_accession"].upper() == normalized:
            return True

    # RNASeq: GEO platform titles often carry an organism suffix, e.g.
    # GSE69572 reports 'Illumina Genome Analyzer IIx (Glycine max)',
    # which should still match 'Illumina Genome Analyzer IIx' because
    # RNASeq platforms are organism agnostic. The supported list is
    # short, so a containment check against each entry is good enough.
    # Spacing is inconsistent, so it is dropped on both sides.
    compact = normalized.replace(" ", "")
    return any(
        rnaseq.upper().replace(" ", "") in compact
        for rnaseq in utils.get_supported_rnaseq_platforms()
    )
예제 #3
0
 def get_queryset(self):
     """Return the parent queryset limited to supported platforms.

     Experiments that only contain unsupported platforms are kept out
     of our ES instance by filtering on the platform accession codes.
     """
     microarray_accessions = []
     for microarray in get_supported_microarray_platforms():
         microarray_accessions.append(microarray['platform_accession'])

     # RNASeq platform names are compared with their spaces removed.
     rnaseq_accessions = []
     for rnaseq in get_supported_rnaseq_platforms():
         rnaseq_accessions.append(rnaseq.replace(' ', ''))

     supported_accessions = [*microarray_accessions, *rnaseq_accessions]

     queryset = super(ExperimentDocument, self).get_queryset()
     # Array lookup, see
     # https://www.postgresql.org/docs/9.1/functions-array.html
     return queryset.filter(
         platform_accession_codes__contained_by=supported_accessions)
    def handle(self, *args, **options):
        """Requeue downloader jobs for samples that were left without any.

        Selects samples on a supported platform that are not processed,
        have at least one original file, and whose original files have
        no downloader jobs associated with them, then calls
        `create_downloader_job` with each sample's original files.

        Options:
            created_after: when set, only samples whose `created_at` is
                later than this value are considered.
        """
        supported_microarray_platforms = [
            x["platform_accession"] for x in get_supported_microarray_platforms()
        ]
        # RNASeq platform names are compared with their spaces removed.
        supported_rnaseq_platforms = [x.replace(" ", "") for x in get_supported_rnaseq_platforms()]
        all_supported_platforms = supported_microarray_platforms + supported_rnaseq_platforms

        # Ensure selected samples have valid platforms
        samples_without_downloader = (
            Sample.objects.all()
            .filter(platform_accession_code__in=all_supported_platforms)
            .annotate(
                original_files_count=Count("original_files"),
                downloader_job_count=Count("original_files__downloader_jobs"),
            )
            .filter(is_processed=False, original_files_count__gt=0, downloader_job_count=0)
        )
        if options.get("created_after"):
            samples_without_downloader = samples_without_downloader.filter(
                created_at__gt=options["created_after"]
            )

        samples_without_downloader = samples_without_downloader.prefetch_related("original_files")

        logger.info(
            "Found %d samples without downloader jobs, starting to create them now.",
            samples_without_downloader.count(),
        )

        paginator = Paginator(samples_without_downloader, PAGE_SIZE)
        # NOTE(review): page() is called without a page number, so this
        # is presumably a project paginator rather than
        # django.core.paginator.Paginator -- confirm.
        page = paginator.page()

        while True:
            # Count what this page actually held; the final page is
            # usually shorter than PAGE_SIZE, so logging PAGE_SIZE would
            # overstate the work done.
            samples_in_page = 0
            for sample in page.object_list:
                logger.debug("Creating downloader job for a sample.", sample=sample.accession_code)
                create_downloader_job(sample.original_files.all())
                samples_in_page += 1

            logger.info(
                "Created %d new downloader jobs because their samples didn't have any.",
                samples_in_page,
            )

            if not page.has_next():
                break

            page = paginator.page(page.next_page_number())
def remove_spaces_from_platform_accessions(apps, schema_editor):
    """Strip spaces from RNASeq platform accession codes on Sample rows.

    get_supported_rnaseq_platforms() returns the platform names with
    spaces, but our platform accessions should have no spaces. Every
    sample whose platform_accession_code equals one of those spaced
    names is rewritten to the space-free form and gets its
    last_modified set to the current time.

    Args:
        apps: used to look up the historical Sample model.
        schema_editor: not used by this function.
    """
    Sample = apps.get_model("data_refinery_common", "Sample")

    for bad_accession in get_supported_rnaseq_platforms():
        platform_accession = bad_accession.replace(" ", "")

        if platform_accession == bad_accession:
            # The name has no spaces, so matching samples are already
            # correct; don't touch their last_modified for a no-op.
            continue

        # update() reports how many rows it matched, so there is no need
        # to evaluate the queryset (and load every matching sample into
        # memory) just to find out whether there is anything to do.
        updated_count = Sample.objects.all().filter(
            platform_accession_code=bad_accession).update(
                platform_accession_code=platform_accession,
                last_modified=timezone.now())

        if updated_count:
            print("Updating platform accession from '%s' to '%s'" %
                  (bad_accession, platform_accession))
예제 #6
0
파일: geo.py 프로젝트: erflynn/refinebio
    def set_platform_properties(self, sample_object: Sample,
                                sample_metadata: Dict,
                                gse: GEOparse.GSM) -> Sample:
        """Fill in the platform-related fields of `sample_object`.

        The platform id is read from `gse.metadata`; from it this sets
        platform_name, platform_accession_code, technology and
        manufacturer on `sample_object`, which is then returned.
        `sample_metadata` is accepted but not used.
        """
        external_accession = get_normalized_platform(
            gse.metadata.get("platform_id", [UNKNOWN])[0])

        if external_accession == UNKNOWN:
            # No platform information at all. For an Affy sample the
            # platform can potentially be extracted from the .CEL file
            # later; for anything else nothing can be done. So assume
            # the technology is microarray.
            sample_object.platform_accession_code = UNKNOWN
            sample_object.platform_name = UNKNOWN
            sample_object.manufacturer = UNKNOWN
            sample_object.technology = "MICROARRAY"
            return sample_object

        gpl = GEOparse.get_GEO(external_accession,
                               destdir=self.get_temp_path(),
                               how="brief",
                               silent=True)
        platform_title = gpl.metadata.get("title", [UNKNOWN])[0]

        # Supported microarray platform? If several entries share the
        # external accession, the last one in the list is used.
        microarray_matches = [
            candidate["platform_accession"]
            for candidate in get_supported_microarray_platforms()
            if candidate["external_accession"] == external_accession
        ]
        platform_accession_code = (microarray_matches[-1]
                                   if microarray_matches else UNKNOWN)

        if platform_accession_code != UNKNOWN:
            # The brain array package name serves as the platform
            # accession code, so, for instance, GPL3213 becomes
            # 'chicken'.
            sample_object.platform_accession_code = platform_accession_code
            sample_object.technology = "MICROARRAY"

            # Related: https://github.com/AlexsLemonade/refinebio/issues/354
            # Affymetrix platforms have a readable name on record.
            try:
                readable_name = get_readable_affymetrix_names()[
                    platform_accession_code]
            except KeyError:
                # Not a known Affymetrix platform: keep GEO's title.
                sample_object.platform_name = platform_title
            else:
                sample_object.platform_name = readable_name
                sample_object.manufacturer = "AFFYMETRIX"
                # Affymetrix samples sometimes have weird channel
                # protocol metadata, so return right away once we know
                # it is Affymetrix. Example: GSE113945
                return sample_object

            # Derive the manufacturer from the pretty platform name;
            # the first keyword found decides.
            pretty_platform = sample_object.pretty_platform.upper()
            microarray_manufacturers = (
                ("AGILENT", "AGILENT"),
                ("ILLUMINA", "ILLUMINA"),
                ("NEXTSEQ", "ILLUMINA"),
                ("AFFYMETRIX", "AFFYMETRIX"),
            )
            sample_object.manufacturer = next(
                (manufacturer
                 for keyword, manufacturer in microarray_manufacturers
                 if keyword in pretty_platform),
                UNKNOWN)

            return sample_object

        # Supported RNASeq platform?
        #
        # GEO RNASeq platform titles often have an organism appended to
        # an otherwise recognizable platform, e.g. GSE69572 reports
        # 'Illumina Genome Analyzer IIx (Glycine max)' where plain
        # 'Illumina Genome Analyzer IIx' is meant, RNASeq platforms
        # being organism agnostic. The supported list is short, so look
        # for each entry inside the title. 'Illumina Genome Analyzer'
        # and 'Illumina Genome Analyzer II' would match that title too,
        # hence the longest names are tried first.
        title_upper = platform_title.upper()
        longest_first = sorted(get_supported_rnaseq_platforms(),
                               key=len,
                               reverse=True)

        for rnaseq_platform in longest_first:
            if rnaseq_platform.upper() not in title_upper:
                continue

            sample_object.technology = "RNA-SEQ"
            sample_object.platform_name = rnaseq_platform
            # RNASeq platform titles double as accessions.
            sample_object.platform_accession_code = rnaseq_platform

            # NOTE(review): here NEXTSEQ gets its own manufacturer
            # value, while the microarray branch above maps NEXTSEQ to
            # ILLUMINA -- confirm which one is intended.
            name_upper = sample_object.platform_name.upper()
            rnaseq_manufacturers = (
                ("ILLUMINA", "ILLUMINA"),
                ("NEXTSEQ", "NEXTSEQ"),
                ("ION TORRENT", "ION_TORRENT"),
            )
            sample_object.manufacturer = next(
                (manufacturer
                 for keyword, manufacturer in rnaseq_manufacturers
                 if keyword in name_upper),
                UNKNOWN)

            return sample_object

        # Unsupported platform: the technology cannot be determined.
        # All that is known is the accession and title GEO reported.
        sample_object.platform_name = platform_title
        sample_object.platform_accession_code = external_accession
        sample_object.technology = UNKNOWN
        sample_object.manufacturer = UNKNOWN

        return sample_object
예제 #7
0
 def test_supported_rnaseq_platforms(self):
     """Test that supported RNASeq platforms setting is set correctly."""
     # assertIn reports both the missing member and the container on
     # failure, unlike assertTrue on an `in` expression.
     self.assertIn("Illumina HiSeq 1000",
                   utils.get_supported_rnaseq_platforms())