def handle(self, *args, **options):
        if options["job_id"] is None:
            logger.error("You must specify a job ID.",
                         job_id=options["job_id"])
            sys.exit(1)

        try:
            job_type = ProcessorPipeline[options["job_name"]]
        except KeyError:
            logger.error("You must specify a valid job name.",
                         job_name=options["job_name"],
                         job_id=options["job_id"])
            sys.exit(1)

        if job_type is ProcessorPipeline.AFFY_TO_PCL:
            from data_refinery_workers.processors.array_express import affy_to_pcl
            affy_to_pcl(options["job_id"])
        elif job_type is ProcessorPipeline.TRANSCRIPTOME_INDEX_SHORT:
            from data_refinery_workers.processors.transcriptome_index import build_transcriptome_index
            build_transcriptome_index(options["job_id"], length="short")
        elif job_type is ProcessorPipeline.TRANSCRIPTOME_INDEX_LONG:
            from data_refinery_workers.processors.transcriptome_index import build_transcriptome_index
            build_transcriptome_index(options["job_id"], length="long")
        elif job_type is ProcessorPipeline.AGILENT_TWOCOLOR_TO_PCL:
            from data_refinery_workers.processors.agilent_twocolor import agilent_twocolor_to_pcl
            agilent_twocolor_to_pcl(options["job_id"])
        elif job_type is ProcessorPipeline.ILLUMINA_TO_PCL:
            from data_refinery_workers.processors.illumina import illumina_to_pcl
            illumina_to_pcl(options["job_id"])
        elif job_type is ProcessorPipeline.SALMON:
            from data_refinery_workers.processors.salmon import salmon
            salmon(options["job_id"])
        elif job_type is ProcessorPipeline.SMASHER:
            from data_refinery_workers.processors.smasher import smash
            smash(options["job_id"])
        elif job_type is ProcessorPipeline.NO_OP:
            from data_refinery_workers.processors.no_op import no_op_processor
            no_op_processor(options["job_id"])
        elif job_type is ProcessorPipeline.JANITOR:
            from data_refinery_workers.processors.janitor import run_janitor
            run_janitor(options["job_id"])
        elif job_type is ProcessorPipeline.QN_REFERENCE:
            from data_refinery_workers.processors import qn_reference
            qn_reference.create_qn_reference(options["job_id"])
        else:
            logger.error(
                ("A valid job name was specified for job %s with id %d but "
                 "no processor function is known to run it."),
                options["job_name"], options["job_id"])
            sys.exit(1)

        sys.exit(0)
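
The handle() above reads options["job_id"] and options["job_name"], but the snippet does not show where those options are declared. A minimal add_arguments sketch, assuming these flag spellings, could look like this:

def add_arguments(self, parser):
    # Hypothetical sketch only: the flag names below are assumptions; the
    # option keys ("job_id", "job_name") are the ones handle() reads above.
    parser.add_argument(
        "--job-id",
        type=int,
        help="The ID of the ProcessorJob record to run.",
    )
    parser.add_argument(
        "--job-name",
        help="A ProcessorPipeline name such as SALMON or AFFY_TO_PCL.",
    )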
def create_qn_target(organism, platform, create_results=True):
    sample_codes_results = Sample.processed_objects.filter(
        platform_accession_code=platform,
        has_raw=True,
        technology="MICROARRAY",
        organism=organism,
        is_processed=True,
    ).values("accession_code")
    sample_codes = [res["accession_code"] for res in sample_codes_results]

    dataset = Dataset()
    dataset.data = {organism.name + "_(" + platform + ")": sample_codes}
    dataset.aggregate_by = "ALL"
    dataset.scale_by = "NONE"
    dataset.quantile_normalize = False
    dataset.save()

    job = ProcessorJob()
    job.pipeline_applied = "QN_REFERENCE"
    job.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dataset
    pjda.save()

    return qn_reference.create_qn_reference(job.pk, create_results=create_results)
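
A minimal usage sketch for the helper above; the organism lookup mirrors calls elsewhere on this page and the platform code is borrowed from the test data below, so treat both as illustrative:

# Illustrative usage of create_qn_target(); organism name and platform code
# are placeholders taken from other snippets on this page.
homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")
final_context = create_qn_target(homo_sapiens, platform="A-MEXP-1171", create_results=False)
if final_context["success"]:
    print("QN target written to " + final_context["target_file"])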
    def handle(self, *args, **options):
        """ """

        if not options["job_id"]:
            if options["organism"] is None and not options["all"]:
                logger.error("You must specify an organism or --all")
                sys.exit(1)

            if options["organism"] and (options.get("organism", "") != "ALL"):
                organisms = [Organism.get_object_for_name(options["organism"].upper())]
            else:
                organisms = Organism.objects.all()

            for organism in organisms:
                if not organism_can_have_qn_target(organism, options["min"]):
                    logger.error(
                        "Organism does not have any platform with enough samples to generate a qn target",
                        organism=organism,
                        min=options["min"],
                    )
                    continue

                if options["platform"] is None:
                    biggest_platform = get_biggest_platform(organism)
                    if biggest_platform is None:
                        logger.error("No processed samples for organism.", organism=organism)
                        continue
                else:
                    biggest_platform = options["platform"]

                final_context = create_qn_target(organism, platform=biggest_platform)

                if final_context["success"]:
                    print(":D")
                    self.stdout.write("Target file: " + final_context["target_file"])
                    self.stdout.write(
                        "Target S3: " + str(final_context["computed_files"][0].get_s3_url())
                    )
                else:
                    print(":(")
        else:
            qn_reference.create_qn_reference(options["job_id"])
Example #4
    def test_qn_reference(self):
        job = ProcessorJob()
        job.pipeline_applied = "QN_REFERENCE"
        job.save()

        homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606)
        homo_sapiens.save()

        experiment = Experiment()
        experiment.accession_code = "12345"
        experiment.save()
        # We don't have a 0.tsv
        codes = [str(i) for i in range(1, 201)]

        for code in codes:
            sample = Sample()
            sample.accession_code = code
            sample.title = code
            sample.platform_accession_code = "A-MEXP-1171"
            sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS"
            sample.organism = homo_sapiens
            sample.technology = "MICROARRAY"
            sample.is_processed = True
            sample.save()

            cr = ComputationalResult()
            cr.save()

            computed_file = ComputedFile()
            computed_file.filename = code + ".tsv"
            computed_file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv"
            computed_file.size_in_bytes = int(code)
            computed_file.result = cr
            computed_file.is_smashable = True
            computed_file.save()

            scfa = SampleComputedFileAssociation()
            scfa.sample = sample
            scfa.computed_file = computed_file
            scfa.save()

            exsa = ExperimentSampleAssociation()
            exsa.experiment = experiment
            exsa.sample = sample
            exsa.save()

        dataset = Dataset()
        dataset.data = {"12345": ["1", "2", "3", "4", "5", "6"]}
        dataset.aggregate_by = "ALL"
        dataset.scale_by = "NONE"
        dataset.quantile_normalize = False  # We don't QN because we're creating the target now
        dataset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dataset
        pjda.save()

        final_context = qn_reference.create_qn_reference(job.pk)
        self.assertTrue(final_context["success"])
        self.assertTrue(os.path.exists(final_context["target_file"]))
        self.assertEqual(os.path.getsize(final_context["target_file"]), 562)

        homo_sapiens.refresh_from_db()
        target = homo_sapiens.qn_target.computedfile_set.latest()
        self.assertEqual(target.sha1, "de69d348f8b239479e2330d596c4013a7b0b2b6a")

        # Create and run a smasher job that will use the QN target we just made.
        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        ds = Dataset()
        ds.data = {"12345": ["1", "2", "3", "4", "5"]}
        ds.aggregate_by = "SPECIES"
        ds.scale_by = "STANDARD"
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = True
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(final_context["success"])

        np.testing.assert_almost_equal(final_context["merged_qn"]["1"][0], -0.4379488527774811)
        np.testing.assert_almost_equal(final_context["original_merged"]["1"][0], -0.5762109)
    def test_qn_reference(self):
        # We don't have a 0.tsv
        experiment = prepare_experiment(range(1, 201))

        job = ProcessorJob()
        job.pipeline_applied = "QN_REFERENCE"
        job.save()

        dataset = Dataset()
        dataset.data = {"12345": ["1", "2", "3", "4", "5", "6"]}
        dataset.aggregate_by = "ALL"
        dataset.scale_by = "NONE"
        dataset.quantile_normalize = False  # We don't QN because we're creating the target now
        dataset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dataset
        pjda.save()

        final_context = qn_reference.create_qn_reference(job.pk)
        self.assertTrue(final_context["success"])
        self.assertTrue(os.path.exists(final_context["target_file"]))
        self.assertEqual(os.path.getsize(final_context["target_file"]), 562)

        homo_sapiens = Organism.objects.get(taxonomy_id=9606)
        target = homo_sapiens.qn_target.computedfile_set.latest()
        self.assertEqual(target.sha1,
                         "de69d348f8b239479e2330d596c4013a7b0b2b6a")

        # Create and run a smasher job that will use the QN target we just made.
        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        ds = Dataset()
        ds.data = {"12345": ["1", "2", "3", "4", "5"]}
        ds.aggregate_by = "SPECIES"
        ds.scale_by = "STANDARD"
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = True
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(final_context["success"])

        np.testing.assert_almost_equal(final_context["merged_qn"]["1"][0],
                                       -0.4379488527774811)
        np.testing.assert_almost_equal(
            final_context["original_merged"]["1"][0], -0.5762109)

        # Make sure that the results were created. We create 200 computed files
        # and computational results (1 for each sample) plus the one generated
        # by the QN reference processor.
        self.assertEqual(ComputedFile.objects.all().count(), 200 + 1)
        self.assertEqual(ComputationalResult.objects.all().count(), 200 + 1)
        self.assertEqual(ComputationalResultAnnotation.objects.all().count(),
                         1)
Example #6
    def handle(self, *args, **options):
        """
        """

        if not options["job_id"]:
            if options["organism"] is None and not options["all"]:
                logger.error("You must specify an organism or --all")
                sys.exit(1)

            if options["organism"] and (options.get("organism", "") != "ALL"):
                organisms = [
                    Organism.get_object_for_name(options["organism"].upper())
                ]
            else:
                organisms = Organism.objects.all()

            for organism in organisms:
                if not organism_can_have_qn_target(organism):
                    logger.error(
                        "Organism does not have any platform with enough samples to generate a qn target",
                        organism=organism,
                        min=options["min"],
                    )
                    continue

                samples = organism.sample_set.filter(has_raw=True,
                                                     technology="MICROARRAY",
                                                     is_processed=True)
                if samples.count() == 0:
                    logger.error(
                        "No processed samples for organism.",
                        organism=organism,
                        count=samples.count(),
                    )
                    continue

                if options["platform"] is None:
                    platform_counts = (
                        samples.values("platform_accession_code").annotate(
                            dcount=Count("platform_accession_code")).order_by(
                                "-dcount"))
                    biggest_platform = platform_counts[0][
                        "platform_accession_code"]
                else:
                    biggest_platform = options["platform"]

                sample_codes_results = Sample.processed_objects.filter(
                    platform_accession_code=biggest_platform,
                    has_raw=True,
                    technology="MICROARRAY",
                    organism=organism,
                    is_processed=True,
                ).values("accession_code")
                sample_codes = [
                    res["accession_code"] for res in sample_codes_results
                ]

                dataset = Dataset()
                dataset.data = {
                    organism.name + "_(" + biggest_platform + ")": sample_codes
                }
                dataset.aggregate_by = "ALL"
                dataset.scale_by = "NONE"
                dataset.quantile_normalize = False
                dataset.save()

                job = ProcessorJob()
                job.pipeline_applied = "QN_REFERENCE"
                job.save()

                pjda = ProcessorJobDatasetAssociation()
                pjda.processor_job = job
                pjda.dataset = dataset
                pjda.save()

                final_context = qn_reference.create_qn_reference(job.pk)

                if final_context["success"]:
                    print(":D")
                    self.stdout.write("Target file: " +
                                      final_context["target_file"])
                    self.stdout.write(
                        "Target S3: " +
                        str(final_context["computed_files"][0].get_s3_url()))
                else:
                    print(":(")
        else:
            qn_reference.create_qn_reference(options["job_id"])
Example #7
    def test_qn_reference(self):
        job = ProcessorJob()
        job.pipeline_applied = "QN_REFERENCE"
        job.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

        experiment = Experiment()
        experiment.accession_code = "12345"
        experiment.save()

        for code in ['1', '2', '3', '4', '5', '6']:
            sample = Sample()
            sample.accession_code = code
            sample.title = code
            sample.platform_accession_code = 'A-MEXP-1171'
            sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS"
            sample.organism = homo_sapiens
            sample.technology = "MICROARRAY"
            sample.is_processed = True
            sample.save()

            cr = ComputationalResult()
            cr.save()

            file = ComputedFile()
            file.filename = code + ".tsv"
            file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv"
            file.size_in_bytes = int(code)
            file.result = cr
            file.is_smashable = True
            file.save()

            scfa = SampleComputedFileAssociation()
            scfa.sample = sample
            scfa.computed_file = file
            scfa.save()

            exsa = ExperimentSampleAssociation()
            exsa.experiment = experiment
            exsa.sample = sample
            exsa.save()

        
        dataset = Dataset()
        dataset.data = {"12345": ["1", "2", "3", "4", "5", "6"]}
        dataset.aggregate_by = "ALL"
        dataset.scale_by = "NONE"
        dataset.quantile_normalize = False # We don't QN because we're creating the target now
        dataset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dataset
        pjda.save()

        final_context = qn_reference.create_qn_reference(job.pk)
        self.assertTrue(final_context['success'])

        self.assertTrue(os.path.exists(final_context['target_file']))
        self.assertEqual(os.path.getsize(final_context['target_file']), 556)

        target = utils.get_most_recent_qn_target_for_organism(homo_sapiens)
        self.assertEqual(target.sha1, '636d72d5cbf4b9785b0bd271a1430b615feaa7ea')

        ###
        # Smasher with QN
        ###

        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        ds = Dataset()
        ds.data = {"12345": ["1", "2", "3", "4", "5"]}
        ds.aggregate_by = 'SPECIES'
        ds.scale_by = 'STANDARD'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = True
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(final_context['success'])

        self.assertEqual(final_context['merged_qn']['1'][0], -0.4379488528812934)
        self.assertEqual(final_context['original_merged']['1'][0], -0.576210936113982)

        ###
        # Test via management command
        ###

        from io import StringIO

        from django.core.management import call_command

        out = StringIO()
        try:
            call_command('create_qn_target', organism='homo_sapiens', min=1, stdout=out)
        except SystemExit:  # this is okay!
            pass

        stdout = out.getvalue()
        self.assertTrue('Target file' in stdout)
        path = stdout.split('\n')[0].split(':')[1].strip()
        self.assertTrue(os.path.exists(path))
        self.assertEqual(path, utils.get_most_recent_qn_target_for_organism(homo_sapiens).absolute_file_path)
Example #8
    def handle(self, *args, **options):
        """
        """

        if options["organism"] is None and not options["all"]:
            logger.error("You must specify an organism or --all")
            sys.exit(1)

        if options["organism"] and (options.get('organism', '') != "ALL"):
            organisms = [
                Organism.get_object_for_name(options["organism"].upper())
            ]
        else:
            organisms = Organism.objects.all()

        for organism in organisms:
            samples = Sample.processed_objects.filter(organism=organism,
                                                      has_raw=True,
                                                      technology="MICROARRAY",
                                                      is_processed=True)
            if samples.count() == 0:
                logger.error("No processed samples for organism.",
                             organism=organism,
                             count=samples.count())
                continue
            if samples.count() < options['min']:
                logger.error(
                    "Proccessed samples don't meet minimum threshhold",
                    organism=organism,
                    count=samples.count(),
                    min=options["min"])
                continue

            if options["platform"] is None:
                platform_counts = samples.values(
                    'platform_accession_code').annotate(dcount=Count(
                        'platform_accession_code')).order_by('-dcount')
                biggest_platform = platform_counts[0][
                    'platform_accession_code']
            else:
                biggest_platform = options["platform"]

            sample_codes_results = Sample.processed_objects.filter(
                platform_accession_code=biggest_platform,
                has_raw=True,
                technology="MICROARRAY",
                is_processed=True).values('accession_code')
            sample_codes = [
                res['accession_code'] for res in sample_codes_results
            ]

            dataset = Dataset()
            dataset.data = {
                organism.name + '_(' + biggest_platform + ')': sample_codes
            }
            dataset.aggregate_by = "ALL"
            dataset.scale_by = "NONE"
            dataset.quantile_normalize = False
            dataset.save()

            job = ProcessorJob()
            job.pipeline_applied = "QN_REFERENCE"
            job.save()

            pjda = ProcessorJobDatasetAssociation()
            pjda.processor_job = job
            pjda.dataset = dataset
            pjda.save()

            final_context = qn_reference.create_qn_reference(job.pk)

            if final_context['success']:
                print(":D")
                self.stdout.write("Target file: " +
                                  final_context['target_file'])
                self.stdout.write(
                    "Target S3: " +
                    str(final_context['computed_files'][0].get_s3_url()))
            else:
                print(":(")