Exemplo n.º 1
0
    def test_survey(self, mock_send_task):
        """A Simple test of the ArrayExpress surveyor."""
        # Survey a small experiment and inspect what got created.
        first_job = self.create_job_for_accession("E-MTAB-3050")
        ArrayExpressSurveyor(first_job).survey()

        all_samples = Sample.objects.all()
        all_downloader_jobs = DownloaderJob.objects.all()

        # Five samples are expected from this experiment...
        self.assertEqual(5, all_samples.count())
        # ...covered by a single DownloaderJob.
        self.assertEqual(1, all_downloader_jobs.count())

        first_sample = Sample.objects.first()
        self.assertIn(' (hgu95av2)', first_sample.pretty_platform)
        # Confirm the sample's protocol_info
        self.assertEqual(9, len(first_sample.protocol_info))
        first_protocol = first_sample.protocol_info[0]
        self.assertEqual("P-MTAB-41854", first_protocol['Accession'])
        self.assertEqual("Aliquoting of biomaterials.", first_protocol['Text'])
        self.assertEqual("split", first_protocol['Type'])

        # Survey a second, larger experiment on top of the first.
        second_job = self.create_job_for_accession("E-GEOD-44719")
        ArrayExpressSurveyor(second_job).survey()

        # 77 new samples are added to the original 5.
        self.assertEqual(77 + 5, all_samples.count())

        # One additional DownloaderJob covers the second experiment.
        self.assertEqual(2, all_downloader_jobs.count())
Exemplo n.º 2
0
    def test_survey(self):
        """Batches are grouped by the download URLs of their files."""
        job = SurveyJob(source_type="ARRAY_EXPRESS")
        surveyor = ArrayExpressSurveyor(job)

        # Two batches reference only URL "a"; a third mixes "b" and "a".
        batch_a1 = Batch(files=[File(download_url="a")])
        batch_a2 = Batch(files=[File(download_url="a")])
        batch_mixed = Batch(files=[File(download_url="b"),
                                   File(download_url="a")])
        surveyor.batches = [batch_a1, batch_a2, batch_mixed]

        # The two pure-"a" batches group together; the mixed one stands alone.
        self.assertEqual([[batch_a1, batch_a2], [batch_mixed]],
                         surveyor.group_batches())
Exemplo n.º 3
0
    def test_experiment_object(self, mock_get):
        """The get_experiment_metadata function extracts all experiment metadata
        from the experiments API."""
        # Serve the canned experiments payload through the mocked HTTP layer.
        mock_get.return_value = Mock(ok=True)
        mock_get.return_value.json.return_value = json.loads(EXPERIMENTS_JSON)

        surveyor = ArrayExpressSurveyor(self.survey_job)
        experiment = surveyor.get_experiment_metadata("E-MTAB-3050")

        # Every extracted field must match the canned payload.
        expected_fields = {
            "name": "Microarray analysis of in vitro differentiation",
            "experiment_accession_code": "E-MTAB-3050",
            "platform_accession_code": "A-AFFY-1",
            "release_date": "2014-10-31",
            "last_update_date": "2014-10-30",
        }
        for field, expected_value in expected_fields.items():
            self.assertEqual(expected_value, experiment[field])
Exemplo n.º 4
0
    def test_survey_with_protocol_list(self):
        """Tests an edge case that came up after months:
        https://github.com/AlexsLemonade/refinebio/issues/761
        """
        job = self.create_job_for_accession("E-MEXP-2381")
        ArrayExpressSurveyor(job).survey()

        # Two samples should be discovered for this experiment,
        # with a single DownloaderJob covering both of them.
        self.assertEqual(2, Sample.objects.all().count())
        self.assertEqual(1, DownloaderJob.objects.all().count())
Exemplo n.º 5
0
    def test_survey(self, mock_send_task, mock_get):
        """The 'survey' function generates one Batch per sample.

        This test also tests the handle_batches method of ExternalSourceSurveyor
        which isn't tested on its own because it is an abstract class.
        """
        mock_send_task.return_value = Mock(ok=True)
        mock_get.side_effect = mocked_requests_get

        ae_surveyor = ArrayExpressSurveyor(self.survey_job)
        ae_surveyor.survey()

        downloader_jobs = DownloaderJob.objects.all()
        # One downloader task should have been dispatched for the single job.
        mock_send_task.assert_has_calls(
            [call(Downloaders.ARRAY_EXPRESS, downloader_jobs[0].id)])
        batches = Batch.objects.all()
        self.assertEqual(2, len(batches))
        self.assertEqual(1, len(downloader_jobs))

        # Spot-check every field of the first batch against the mocked data.
        batch = batches[0]
        self.assertEqual(batch.survey_job.id, self.survey_job.id)
        self.assertEqual(batch.source_type, "ARRAY_EXPRESS")
        self.assertEqual(batch.pipeline_required, "AFFY_TO_PCL")
        self.assertEqual(batch.platform_accession_code, "A-AFFY-1")
        self.assertEqual(batch.experiment_accession_code, "E-MTAB-3050")
        self.assertEqual(batch.experiment_title,
                         "Microarray analysis of in vitro differentiation")
        self.assertEqual(batch.status, "NEW")
        self.assertEqual(batch.release_date, datetime.date(2014, 10, 31))
        self.assertEqual(batch.last_uploaded_date, datetime.date(2014, 10, 30))
        self.assertEqual(batch.organism_id, 9606)
        # BUG FIX: the expected organism name was corrupted to "H**O SAPIENS"
        # (a profanity-filter scrape artifact); taxon 9606 is HOMO SAPIENS.
        self.assertEqual(batch.organism_name, "HOMO SAPIENS")

        file = batch.files[0]
        self.assertEqual(file.size_in_bytes, -1)
        self.assertEqual(
            file.download_url,
            "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-3050/E-MTAB-3050.raw.1.zip"
        )  # noqa
        self.assertEqual(file.raw_format, "CEL")
        self.assertEqual(file.processed_format, "PCL")
        self.assertEqual(file.name, "C30057.CEL")
        self.assertEqual(file.internal_location, "A-AFFY-1/AFFY_TO_PCL")
Exemplo n.º 6
0
def _get_surveyor_for_source(survey_job: SurveyJob):
    """Factory method for ExternalSourceSurveyors."""
    # Map each supported source type to its surveyor class.
    surveyor_classes = {
        "ARRAY_EXPRESS": ArrayExpressSurveyor,
        "SRA": SraSurveyor,
        "TRANSCRIPTOME_INDEX": TranscriptomeIndexSurveyor,
    }
    surveyor_class = surveyor_classes.get(survey_job.source_type)
    if surveyor_class is None:
        raise SourceNotSupportedError("Source " + survey_job.source_type +
                                      " is not supported.")
    return surveyor_class(survey_job)
Exemplo n.º 7
0
    def test_survey(self, mock_send_task):
        """A Simple test of the ArrayExpress surveyor."""
        job_one = self.create_job_for_accession("E-MTAB-3050")
        ArrayExpressSurveyor(job_one).survey()

        sample_queryset = Sample.objects.all()
        downloader_queryset = DownloaderJob.objects.all()

        # This experiment yields 5 samples and a single DownloaderJob
        # covering all of them.
        self.assertEqual(5, sample_queryset.count())
        self.assertEqual(1, downloader_queryset.count())

        # The experiment record carries the source dates (UTC midnight).
        experiment = Experiment.objects.first()
        self.assertEqual("E-MTAB-3050", experiment.accession_code)
        self.assertEqual(datetime.datetime(2014, 10, 31, tzinfo=timezone.utc),
                         experiment.source_first_published)
        self.assertEqual(datetime.datetime(2014, 10, 30, tzinfo=timezone.utc),
                         experiment.source_last_modified)

        sample = Sample.objects.first()
        self.assertIn(" (hgu95av2)", sample.pretty_platform)
        # Confirm the sample's protocol_info
        self.assertEqual(9, len(sample.protocol_info))
        protocol = sample.protocol_info[0]
        self.assertEqual("P-MTAB-41854", protocol["Accession"])
        self.assertEqual("Aliquoting of biomaterials.", protocol["Text"])
        self.assertEqual("split", protocol["Type"])

        # A second survey accumulates on top of the first.
        job_two = self.create_job_for_accession("E-GEOD-44719")
        ArrayExpressSurveyor(job_two).survey()

        # 77 more samples and one more DownloaderJob.
        self.assertEqual(77 + 5, sample_queryset.count())
        self.assertEqual(2, downloader_queryset.count())
Exemplo n.º 8
0
    def test_determine_accession(self):
        """Test of the `determine_sample_accession` function.

        Walks real ArrayExpress sample listings for several experiments and
        verifies an accession code can be determined for each sample.
        """
        survey_job = self.create_job_for_accession("E-MTAB-3050")
        ae_surveyor = ArrayExpressSurveyor(survey_job)

        EXPERIMENTS_URL = "https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/"
        SAMPLES_URL = EXPERIMENTS_URL + "{}/samples"

        ex_accessions = [
            "E-MTAB-3050",
            "E-MEXP-669",
            "E-MEXP-2215",
            "E-MEXP-2288",
            "E-MEXP-2381",
            "E-MTAB-6739",
        ]

        for ex_accession in ex_accessions:
            samples_endpoint = SAMPLES_URL.format(ex_accession)
            r = requests.get(samples_endpoint, timeout=60)
            samples = r.json()["experiment"]["sample"]

            # An experiment can have many samples
            for sample in samples:

                # For some reason, this sample has no files associated with it.
                if "file" not in sample or len(sample['file']) == 0:
                    continue

                # The accession code is not a simple matter to determine.
                sample_source_name = sample["source"].get("name", "")
                sample_assay_name = sample["assay"].get("name", "")

                has_raw = False
                # BUG FIX: initialize so a sample whose files are all
                # skipped below can't raise UnboundLocalError.
                filename = None
                for sub_file in sample['file']:

                    # For ex: E-GEOD-15645 the comment is a list; collapse
                    # it to its first entry. NOTE(review): this mutates the
                    # parsed JSON in place, as the original code did.
                    if isinstance(sub_file['comment'], list):
                        sub_file_mod = sub_file
                        sub_file_mod['comment'] = sub_file['comment'][0]
                    else:
                        sub_file_mod = sub_file

                    # A "data" entry with any comment value means raw data.
                    comment_value = sub_file_mod['comment'].get('value')
                    if sub_file_mod['type'] == "data" and comment_value is not None:
                        has_raw = True
                    # BUG FIX: `or ''` guards against a present-but-None
                    # 'value' key, which would make `in` raise TypeError.
                    if 'raw' in (comment_value or ''):
                        has_raw = True

                    # Skip derived data if we have it raw.
                    if has_raw and "derived data" in sub_file['type']:
                        continue
                    elif (not has_raw) and "derived data" not in sub_file['type']:
                        # If there is a platform warning then we don't want raw data.
                        continue
                    filename = sub_file["name"]

                # No usable file was found for this sample; nothing to check.
                if filename is None:
                    continue

                sample_accession_code = ae_surveyor.determine_sample_accession(
                    ex_accession,
                    sample_source_name,
                    sample_assay_name,
                    filename)
                self.assertTrue(sample_accession_code is not None)
    def handle(self, *args, **options):
        """Refreshes the metadata for all experiments, or experiments from a specific database.

        Pages through experiments (optionally filtered by --source-database),
        re-fetches metadata from the original source, and saves each
        experiment. Per-experiment failures are logged and skipped.
        """
        possible_source_databases = ["ARRAY_EXPRESS", "GEO", "SRA"]

        if options.get("source_database", None) is None:
            experiments = Experiment.objects.all()
        elif options["source_database"] in possible_source_databases:
            source_database = options["source_database"]
            experiments = Experiment.objects.filter(
                source_database=source_database)
        else:
            logger.error('Invalid source database "{}"'.format(
                options["source_database"]) +
                         "\nPossible source databases: {}".format(", ".join(
                             possible_source_databases)))
            sys.exit(1)

        paginator = PerformantPaginator(experiments, PAGE_SIZE)
        page = paginator.page()

        while True:
            for experiment in page.object_list:
                logger.debug("Refreshing metadata for an experiment.",
                             experiment=experiment.accession_code)
                try:
                    if experiment.source_database == "SRA":
                        metadata = SraSurveyor.gather_all_metadata(
                            experiment.samples.first().accession_code)
                        SraSurveyor._apply_metadata_to_experiment(
                            experiment, metadata)

                    elif experiment.source_database == "GEO":
                        gse = GEOparse.get_GEO(
                            experiment.accession_code,
                            destdir="/tmp/management",
                            silent=True,
                        )

                        GeoSurveyor._apply_metadata_to_experiment(
                            experiment, gse)

                    elif experiment.source_database == "ARRAY_EXPRESS":
                        request_url = EXPERIMENTS_URL + experiment.accession_code
                        experiment_request = utils.requests_retry_session(
                        ).get(request_url, timeout=60)
                        try:
                            parsed_json = experiment_request.json(
                            )["experiments"]["experiment"][0]
                        except KeyError:
                            # BUG FIX: this logger call used to pass
                            # survey_job=self.survey_job.id, but a management
                            # command has no survey_job attribute, so the
                            # error path itself raised AttributeError (then
                            # swallowed by the outer except) and the log
                            # line was never emitted.
                            logger.error(
                                "Remote experiment has no Experiment data!",
                                experiment_accession_code=experiment.
                                accession_code,
                            )
                            continue
                        ArrayExpressSurveyor._apply_metadata_to_experiment(
                            experiment, parsed_json)

                    experiment.save()

                # If there are any errors, just continue. It's likely that it's
                # just a problem with this experiment.
                except Exception:
                    logger.exception(
                        "exception caught while updating metadata for {}".
                        format(experiment.accession_code))

            if not page.has_next():
                break
            else:
                page = paginator.page(page.next_page_number())

            # 2000 samples queued up every five minutes should be fast
            # enough and also not thrash the DB.
            time.sleep(60 * 5)
Exemplo n.º 10
0
    def handle(self, *args, **options):
        """Refreshes the metadata for all samples, or samples from a specific database.

        Pages through samples (optionally filtered by --source-database),
        re-harmonizes metadata from the original source, and saves each
        sample. Per-sample failures are logged and skipped.
        """
        possible_source_databases = ["ARRAY_EXPRESS", "GEO", "SRA"]

        if options.get("source_database", None) is None:
            samples = Sample.objects.all()
        elif options["source_database"] in possible_source_databases:
            source_database = options["source_database"]
            samples = Sample.objects.filter(source_database=source_database)
        else:
            logger.error('Invalid source database "{}"'.format(
                options["source_database"]) +
                         "\nPossible source databases: {}".format(", ".join(
                             possible_source_databases)))
            sys.exit(1)

        paginator = PerformantPaginator(samples, PAGE_SIZE)
        page = paginator.page()

        while True:
            # BUG FIX: this used to iterate the full, unpaginated `samples`
            # queryset on every page, re-processing ALL samples once per
            # page and defeating the paginator entirely.
            for sample in page.object_list:
                logger.debug("Refreshing metadata for a sample.",
                             sample=sample.accession_code)
                try:
                    if sample.source_database == "SRA":
                        metadata = SraSurveyor.gather_all_metadata(
                            sample.accession_code)
                        SraSurveyor._apply_harmonized_metadata_to_sample(
                            sample, metadata)
                    elif sample.source_database == "GEO":
                        gse = GEOparse.get_GEO(
                            sample.experiments.first().accession_code,
                            destdir="/tmp/management",
                            how="brief",
                            silent=True,
                        )
                        preprocessed_samples = harmony.preprocess_geo(
                            gse.gsms.items())
                        harmonized_samples = harmony.harmonize(
                            preprocessed_samples)
                        GeoSurveyor._apply_harmonized_metadata_to_sample(
                            sample, harmonized_samples[sample.title])
                    elif sample.source_database == "ARRAY_EXPRESS":
                        SDRF_URL_TEMPLATE = (
                            "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
                        )
                        sdrf_url = SDRF_URL_TEMPLATE.format(
                            code=sample.experiments.first().accession_code)
                        sdrf_samples = harmony.parse_sdrf(sdrf_url)
                        harmonized_samples = harmony.harmonize(sdrf_samples)
                        ArrayExpressSurveyor._apply_harmonized_metadata_to_sample(
                            sample, harmonized_samples[sample.title])

                    sample.save()

                # If there are any errors, just continue; it's likely a
                # problem with this one sample (matches the behavior of the
                # experiment-refresh command).
                except Exception:
                    logger.exception(
                        "exception caught while updating metadata for {}".
                        format(sample.accession_code))

            if not page.has_next():
                break
            else:
                page = paginator.page(page.next_page_number())

            # 2000 samples queued up every five minutes should be fast
            # enough and also not thrash the DB.
            time.sleep(60 * 5)