Example #1
def create_processor_job(pipeline="AFFY_TO_PCL",
                         ram_amount=2048,
                         start_time=None):
    og_file_1 = OriginalFile()
    og_file_1.source_filename = "doesn't matter"
    og_file_1.filename = "this either"
    og_file_1.absolute_file_path = "nor this"
    og_file_1.save()

    og_file_2 = OriginalFile()
    og_file_2.source_filename = "doesn't matter"
    og_file_2.filename = "this either"
    og_file_2.absolute_file_path = "nor this"
    og_file_2.save()

    downloader_job = None
    if pipeline == "AFFY_TO_PCL":
        downloader_job = DownloaderJob(
            downloader_task="SRA",
            batch_job_id="DEFAULT",
            num_retries=0,
            accession_code="NUNYA",
            success=None,
        )
        downloader_job.save()

        assoc = DownloaderJobOriginalFileAssociation()
        assoc.original_file = og_file_2
        assoc.downloader_job = downloader_job
        assoc.save()

        assoc1 = DownloaderJobOriginalFileAssociation()
        assoc1.original_file = og_file_1
        assoc1.downloader_job = downloader_job
        assoc1.save()

    processor_job = ProcessorJob(
        downloader_job=downloader_job,
        pipeline_applied=pipeline,
        batch_job_id="PROCESSOR/dispatch-1528945054-e8eaf540",
        ram_amount=ram_amount,
        num_retries=0,
        success=None,
        start_time=start_time,
    )
    processor_job.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file_1
    assoc1.processor_job = processor_job
    assoc1.save()

    assoc = ProcessorJobOriginalFileAssociation()
    assoc.original_file = og_file_2
    assoc.processor_job = processor_job
    assoc.save()

    return processor_job
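
A minimal usage sketch for the factory above, assuming it is called from a test module that already imports the refinebio models:

# Build a queued AFFY_TO_PCL job with extra RAM.
job = create_processor_job(pipeline="AFFY_TO_PCL", ram_amount=4096)
assert job.success is None and job.num_retries == 0

# Non-AFFY pipelines skip the DownloaderJob bookkeeping entirely.
salmon_job = create_processor_job(pipeline="SALMON")
assert salmon_job.downloader_job is None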
Example #2
def prepare_job():
    pj = ProcessorJob()
    pj.pipeline_applied = "AFFY_TO_PCL"
    pj.save()

    original_file = OriginalFile()
    original_file.source_filename = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip"
    original_file.filename = "GSM1426071_CD_colon_active_1.CEL"
    original_file.absolute_file_path = "/home/user/data_store/raw/TEST/CEL/GSM1426071_CD_colon_active_1.CEL"
    original_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = original_file
    assoc1.processor_job = pj
    assoc1.save()

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    sample = Sample()
    sample.title = "Heyo"
    sample.organism = c_elegans
    sample.is_processed = False
    sample.save()

    ogsa = OriginalFileSampleAssociation()
    ogsa.sample = sample
    ogsa.original_file = original_file
    ogsa.save()

    return pj
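
A prepared job like this is normally handed straight to a worker entry point. The call below mirrors the no_op.no_op_processor(job.pk) and illumina.illumina_to_pcl(pj.pk) patterns used later on this page; the AFFY module path and function name are assumptions:

from data_refinery_workers.processors import array_express  # assumed module path

pj = prepare_job()
final_context = array_express.affy_to_pcl(pj.pk)  # assumed entry point
assert final_context["success"]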
Example #3
def prepare_dotsra_job(filename="ERR1562482.sra"):
    pj = ProcessorJob()
    pj.pipeline_applied = "SALMON"
    pj.id = random.randint(111, 999999)
    pj.save()

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    samp = Sample()
    samp.accession_code = "SALMON" # So the test files go to the right place
    samp.organism = c_elegans
    samp.save()

    prepare_organism_indices()

    og_file = OriginalFile()
    og_file.source_filename = filename
    og_file.filename = filename
    og_file.absolute_file_path = "/home/user/data_store/raw/TEST/SALMON/" + filename
    og_file.save()

    og_file_samp_assoc = OriginalFileSampleAssociation()
    og_file_samp_assoc.original_file = og_file
    og_file_samp_assoc.sample = samp
    og_file_samp_assoc.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    return pj, [og_file]
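
Because this helper returns both the job and its original files, callers unpack the tuple; a short sketch (the alternate filename is hypothetical):

pj, original_files = prepare_dotsra_job()  # defaults to ERR1562482.sra
assert original_files[0].filename == "ERR1562482.sra"

pj2, files2 = prepare_dotsra_job(filename="ERR0000000.sra")  # hypothetical filename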
Example #4
def create_processor_job_for_original_files(original_files: List[OriginalFile],
                                            volume_index: int):
    """
    Create a processor job and queue a processor task for a sample related to an experiment.
    """
    # If there's no original files then we've created all the jobs we need to!
    if len(original_files) == 0:
        return
    # For anything that has raw data there should only be one Sample per OriginalFile
    sample_object = original_files[0].samples.first()
    pipeline_to_apply = determine_processor_pipeline(sample_object,
                                                     original_files[0])
    if pipeline_to_apply == ProcessorPipeline.NONE:
        logger.info("No valid processor pipeline found to apply to sample.",
                    sample=sample_object.id,
                    original_file=original_files[0].id)
        for original_file in original_files:
            original_file.delete_local_file()
            original_file.is_downloaded = False
            original_file.save()
    else:
        processor_job = ProcessorJob()
        processor_job.pipeline_applied = pipeline_to_apply.value
        processor_job.ram_amount = determine_ram_amount(
            sample_object, processor_job)
        processor_job.volume_index = volume_index
        processor_job.save()
        for original_file in original_files:
            assoc = ProcessorJobOriginalFileAssociation()
            assoc.original_file = original_file
            assoc.processor_job = processor_job
            assoc.save()
        logger.debug("Queuing processor job.", processor_job=processor_job.id)
        send_job(pipeline_to_apply, processor_job)
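
A hedged sketch of how a downloader task might hand its freshly downloaded files to this helper; the original_files reverse relation on DownloaderJob and the volume index value are assumptions:

original_files = list(downloader_job.original_files.all())  # assumed reverse relation
create_processor_job_for_original_files(original_files, volume_index=0)  # hypothetical index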
Example #5
def run_tximport():
    """Creates a tximport job for all eligible experiments."""
    eligible_experiments = (Experiment.objects.annotate(
        num_organisms=Count("organisms")).filter(
            num_organisms=1, technology="RNA-SEQ",
            num_processed_samples=0).prefetch_related("samples__results"))

    paginator = Paginator(eligible_experiments, PAGE_SIZE)
    page = paginator.page(1)

    # Next we need to figure out how many samples were processed for
    # each experiment. We should be able to reuse code from salmon
    # since it already does this.
    tximport_pipeline = ProcessorPipeline.TXIMPORT

    while True:
        creation_count = 0

        for experiment in page.object_list:
            quant_results = get_quant_results_for_experiment(experiment)

            if should_run_tximport(experiment, quant_results, True):
                processor_job = ProcessorJob()
                processor_job.pipeline_applied = tximport_pipeline.value
                processor_job.ram_amount = 8192
                # This job doesn't need to run on a specific volume
                # but it uses the same Nomad job as Salmon jobs which
                # do require the volume index.
                processor_job.volume_index = random.choice(
                    list(get_active_volumes()))
                processor_job.save()

                assoc = ProcessorJobOriginalFileAssociation()
                # Any original file linked to any sample of the
                # experiment will work. Tximport is somewhat special
                # in that it doesn't actually use original files so
                # this is just used to point to the experiment.
                assoc.original_file = experiment.samples.all(
                )[0].original_files.all()[0]
                assoc.processor_job = processor_job
                assoc.save()

                creation_count += 1

                try:
                    send_job(tximport_pipeline, processor_job)
                except Exception:
                    # If we cannot queue the job now the Foreman will do
                    # it later.
                    pass

        logger.info(
            "Created %d tximport jobs for experiments past the thresholds.",
            creation_count)

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())
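
The loop above is the standard page-by-page Paginator walk. The same pattern can be factored into a small generator; a sketch assuming a plain Django Paginator over a queryset:

from django.core.paginator import Paginator

def iterate_pages(queryset, page_size):
    """Yield each page's object_list in order, one page at a time."""
    paginator = Paginator(queryset, page_size)
    page = paginator.page(1)
    while True:
        yield page.object_list
        if not page.has_next():
            break
        page = paginator.page(page.next_page_number())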
Example #6
def prepare_illumina_job(organism):
    pj = ProcessorJob()
    pj.pipeline_applied = "ILLUMINA_TO_PCL"
    pj.save()

    og_file = OriginalFile()
    og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427%5Fnon%2Dnormalized%2Etxt.gz"
    og_file.filename = "GSE22427_non-normalized.txt"
    og_file.absolute_file_path = (
        "/home/user/data_store/raw/TEST/ILLUMINA/GSE22427_non-normalized.txt")
    og_file.is_downloaded = True
    og_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    sample_names = [
        "LV-C&si-Control-1",
        "LV-C&si-Control-2",
        "LV-C&si-Control-3",
        "LV-C&si-EZH2-1",
        "LV-C&si-EZH2-2",
        "LV-C&si-EZH2-3",
        "LV-EZH2&si-EZH2-1",
        "LV-EZH2&si-EZH2-2",
        "LV-EZH2&si-EZH2-3",
        "LV-T350A&si-EZH2-1",
        "LV-T350A&si-EZH2-2",
        "LV-T350A&si-EZH2-3",
    ]

    for name in sample_names:
        sample = Sample()
        sample.accession_code = name
        sample.title = name
        sample.organism = organism
        sample.save()

        sa = SampleAnnotation()
        sa.sample = sample
        sa.data = {"description": [name]}
        sa.is_ccdl = False
        sa.save()

        sample_assoc = OriginalFileSampleAssociation()
        sample_assoc.original_file = og_file
        sample_assoc.sample = sample
        sample_assoc.save()

    sample = Sample.objects.get(title="LV-T350A&si-EZH2-3")
    sample.title = "ignoreme_for_description"
    sample.accession_code = "ignoreme_for_description"
    sample.save()

    return pj
Example #7
def prepare_illumina_job(job_info: Dict) -> ProcessorJob:
    pj = ProcessorJob()
    pj.pipeline_applied = "ILLUMINA_TO_PCL"
    pj.save()

    og_file = OriginalFile()
    og_file.source_filename = job_info["source_filename"]
    og_file.filename = job_info["filename"]
    og_file.absolute_file_path = job_info["absolute_file_path"]
    og_file.is_downloaded = True
    og_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    for s in job_info["samples"]:
        # For convenience, if you give a list of strings we'll just use the
        # strings as both titles and accessions.
        annotation = None
        if type(s) == str:
            accession_code = s
            title = s
        elif type(s) == tuple and list(map(type, s)) == [str, str]:
            accession_code, title = s
        elif type(s) == tuple and list(map(type, s)) == [str, str, dict]:
            accession_code, title, annotation = s
        else:
            raise ValueError(f"Invalid sample type for sample {s}")

        sample = Sample()
        sample.accession_code = accession_code
        sample.title = title
        sample.organism = job_info["organism"]
        sample.save()

        sa = SampleAnnotation()
        sa.sample = sample
        sa.data = annotation if annotation is not None else {
            "description": [title]
        }
        sa.is_ccdl = False
        sa.save()

        sample_assoc = OriginalFileSampleAssociation()
        sample_assoc.original_file = og_file
        sample_assoc.sample = sample
        sample_assoc.save()

    return pj
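
The samples list accepts the three shapes handled above; a usage sketch in which every accession, URL, and path is a hypothetical placeholder:

organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
organism.save()

pj = prepare_illumina_job({
    "source_filename": "https://example.org/GSE00000_non-normalized.txt.gz",
    "filename": "GSE00000_non-normalized.txt",
    "absolute_file_path": "/tmp/GSE00000_non-normalized.txt",
    "organism": organism,
    "samples": [
        "GSM0000001",                                       # plain string: accession == title
        ("GSM0000002", "control replicate 2"),              # (accession, title)
        ("GSM0000003", "treated", {"description": ["x"]}),  # (accession, title, annotation)
    ],
})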
Example #8
    def test_convert_processed_illumina(self):
        job = ProcessorJob()
        job.pipeline_applied = "NO_OP"
        job.save()

        # ex:
        # Reporter Identifier VALUE   Detection Pval
        # ILMN_1343291    14.943602   0
        # ILMN_1343295    13.528082   0
        og_file = OriginalFile()
        og_file.source_filename = "https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-22433/"
        og_file.filename = "GSM557500_sample_table.txt"
        og_file.absolute_file_path = (
            "/home/user/data_store/raw/TEST/NO_OP/GSM557500_sample_table.txt")
        og_file.is_downloaded = True
        og_file.save()

        homo_sapiens = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
        homo_sapiens.save()

        sample = Sample()
        sample.accession_code = "GSM557500"
        sample.title = "GSM557500"
        sample.platform_accession_code = "A-MEXP-1171"
        sample.manufacturer = "ILLUMINA"
        sample.organism = homo_sapiens
        sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.original_file = og_file
        assoc.sample = sample
        assoc.save()

        assoc1 = ProcessorJobOriginalFileAssociation()
        assoc1.original_file = og_file
        assoc1.processor_job = job
        assoc1.save()

        # To:
        # ENSG00000156508 14.943602
        # ENSG00000111640 13.528082
        final_context = no_op.no_op_processor(job.pk)
        self.assertTrue(final_context["success"])
        self.assertTrue(os.path.exists(final_context["output_file_path"]))
        self.assertEqual(os.path.getsize(final_context["output_file_path"]),
                         920374)
        self.assertTrue(
            no_op.check_output_quality(final_context["output_file_path"]))
Example #9
    def test_convert_illumina_no_header(self):
        job = ProcessorJob()
        job.pipeline_applied = "NO_OP"
        job.save()

        # ex:
        # ILMN_1885639    10.0000 0.7931
        # ILMN_2209417    10.0000 0.2029
        # ILMN_1765401    152.0873    0.0000
        og_file = OriginalFile()
        og_file.source_filename = (
            "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1.txt"
        )
        og_file.filename = "GSM1089291-tbl-1.txt"
        og_file.absolute_file_path = "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1.txt"
        og_file.is_downloaded = True
        og_file.save()

        homo_sapiens = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
        homo_sapiens.save()

        sample = Sample()
        sample.accession_code = "GSM557500"
        sample.title = "GSM557500"
        sample.platform_accession_code = "A-MEXP-1171"
        sample.manufacturer = "ILLUMINA"
        sample.organism = homo_sapiens
        sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.original_file = og_file
        assoc.sample = sample
        assoc.save()

        assoc1 = ProcessorJobOriginalFileAssociation()
        assoc1.original_file = og_file
        assoc1.processor_job = job
        assoc1.save()

        # To:
        # ENSG00000105675 10
        # ENSG00000085721 152.0873
        # ENSG00000278494 152.0873
        final_context = no_op.no_op_processor(job.pk)
        self.assertTrue(final_context["success"])
        self.assertTrue(os.path.exists(final_context["output_file_path"]))
        self.assertEqual(os.path.getsize(final_context["output_file_path"]),
                         786207)
Example #10
    def test_good_detection(self):
        """GSE54661 appears to be mislabled (illuminaHumanv4) on GEO. Shows our detector works. """
        from data_refinery_workers.processors import illumina

        pj = ProcessorJob()
        pj.pipeline_applied = "ILLUMINA_TO_PCL"
        pj.save()

        og_file = OriginalFile()
        og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE54nnn/GSE54661/suppl/GSE54661%5Fnon%5Fnormalized%2Etxt%2Egz"
        og_file.filename = "GSE54661_non_normalized.txt"
        og_file.absolute_file_path = (
            "/home/user/data_store/raw/TEST/ILLUMINA/GSE54661_non_normalized.txt"
        )
        og_file.is_downloaded = True
        og_file.save()

        assoc1 = ProcessorJobOriginalFileAssociation()
        assoc1.original_file = og_file
        assoc1.processor_job = pj
        assoc1.save()

        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        sample = Sample()
        sample.accession_code = "ABCD-1234"
        sample.title = "hypoxia_Signal"
        sample.organism = organism
        sample.save()

        sample_assoc = OriginalFileSampleAssociation()
        sample_assoc.original_file = og_file
        sample_assoc.sample = sample
        sample_assoc.save()

        final_context = illumina.illumina_to_pcl(pj.pk)
        self.assertEqual(final_context["platform"], "illuminaHumanv3")

        for key in final_context["samples"][0].sampleannotation_set.all(
        )[0].data.keys():
            self.assertTrue(key in [
                "detected_platform", "detection_percentage",
                "mapped_percentage"
            ])

        # Clean up after the job, since it won't clean up after itself when we aren't running in the cloud.
        shutil.rmtree(final_context["work_dir"], ignore_errors=True)
Example #11
    def test_convert_illumina_bad_cols(self):
        """
        In future, this test may be deprecated. For now it just alerts that it needs attention.
        """
        job = ProcessorJob()
        job.pipeline_applied = "NO_OP"
        job.save()

        # ex:
        # ILMN_1885639    10.0000 0.7931  11.0000 0.123
        # ILMN_2209417    10.0000 0.2029  11.1234 0.543
        # LMN_1765401    152.0873    0.0000  99.999  0.19
        og_file = OriginalFile()
        og_file.source_filename = (
            "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1-modified.txt"
        )
        og_file.filename = "GSM1089291-tbl-1-modified.txt"
        og_file.absolute_file_path = (
            "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1-modified.txt"
        )
        og_file.is_downloaded = True
        og_file.save()

        homo_sapiens = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
        homo_sapiens.save()

        sample = Sample()
        sample.accession_code = "GSM557500"
        sample.title = "GSM557500"
        sample.platform_accession_code = "A-MEXP-1171"
        sample.manufacturer = "ILLUMINA"
        sample.organism = homo_sapiens
        sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.original_file = og_file
        assoc.sample = sample
        assoc.save()

        assoc1 = ProcessorJobOriginalFileAssociation()
        assoc1.original_file = og_file
        assoc1.processor_job = job
        assoc1.save()

        final_context = no_op.no_op_processor(job.pk)
        self.assertFalse(final_context["success"])
        self.assertTrue("Tell Rich!" in final_context["job"].failure_reason)
Example #12
def prepare_non_ba_job():
    pj = ProcessorJob()
    pj.pipeline_applied = "AFFY_TO_PCL"
    pj.save()

    og_file = OriginalFile()
    og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM45nnn/GSM45588/suppl/GSM45588.CEL.gz"
    og_file.filename = "GSM45588.CEL"
    og_file.absolute_file_path = "/home/user/data_store/raw/TEST/CEL/GSM45588.CEL"
    og_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    return pj
Example #13
def prepare_agilent_twocolor_job():
    pj = ProcessorJob()
    pj.pipeline_applied = "AGILENT_TWOCOLOR_TO_PCL"
    pj.save()

    og_file = OriginalFile()
    og_file.source_filename = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE22900&format=file"
    og_file.filename = "GSM466597_95899_agilent.txt"
    og_file.absolute_file_path = "/home/user/data_store/raw/TEST/AGILENT_TWOCOLOR/GSM466597_95899_agilent.txt"
    og_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    return pj
Example #14
def prepare_ba_job():
    pj = ProcessorJob()
    pj.pipeline_applied = "AFFY_TO_PCL"
    pj.save()

    og_file = OriginalFile()
    og_file.source_filename = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip"
    og_file.filename = "GSM1426071_CD_colon_active_1.CEL"
    og_file.absolute_file_path = "/home/user/data_store/raw/TEST/CEL/GSM1426071_CD_colon_active_1.CEL"
    og_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    return pj
Example #15
    def handle(self, *args, **options):
        """Run tximport on an experiment for the samples that have been processed by Salmon.
        """
        if options["experiment_accession"] is None:
            logger.error(
                "The --experiment-accession argument must be provided")
            sys.exit(1)
        else:
            accession_code = options["experiment_accession"]

        # Find an OriginalFile associated with one of the Experiment's
        # samples which had Salmon run successfully on it.
        original_file = None
        experiment = Experiment.objects.get(accession_code=accession_code)
        for sample in experiment.samples.all():
            # Only need to loop until we actually find an
            # original_file with a successful job.
            if original_file:
                break

            pjs_for_sample = sample.get_processor_jobs()
            for processor_job in list(pjs_for_sample):
                if processor_job.success:
                    original_file = processor_job.original_files.first()
                    if original_file:
                        break

        if not original_file:
            logger.error(
                "Could not find a single sample in the experiment that had a successful Salmon job.",
                experiment=accession_code,
            )
            sys.exit(1)

        job = ProcessorJob()
        job.pipeline_applied = "TXIMPORT"
        job.save()

        pjofa = ProcessorJobOriginalFileAssociation()
        pjofa.processor_job = job
        pjofa.original_file = original_file
        pjofa.save()

        tximport.tximport(job.id)

        sys.exit(0)
Example #16
def create_processor_jobs_for_original_files(
        original_files: List[OriginalFile],
        downloader_job: DownloaderJob = None):
    """
    Create processor jobs and queue processor tasks for samples related to an experiment.
    """
    for original_file in original_files:
        sample_object = original_file.samples.first()

        if not delete_if_blacklisted(original_file):
            continue

        pipeline_to_apply = determine_processor_pipeline(
            sample_object, original_file)

        if pipeline_to_apply == ProcessorPipeline.NONE:
            logger.info(
                "No valid processor pipeline found to apply to sample.",
                sample=sample_object.id,
                original_file=original_files[0].id)
            original_file.delete_local_file()
            original_file.is_downloaded = False
            original_file.save()
        else:
            processor_job = ProcessorJob()
            processor_job.pipeline_applied = pipeline_to_apply.value
            processor_job.ram_amount = determine_ram_amount(
                sample_object, processor_job)
            processor_job.save()

            assoc = ProcessorJobOriginalFileAssociation()
            assoc.original_file = original_file
            assoc.processor_job = processor_job
            assoc.save()

            if downloader_job:
                logger.debug("Queuing processor job.",
                             processor_job=processor_job.id,
                             original_file=original_file.id,
                             downloader_job=downloader_job.id)
            else:
                logger.debug("Queuing processor job.",
                             processor_job=processor_job.id,
                             original_file=original_file.id)

            send_job(pipeline_to_apply, processor_job)
Example #17
def prepare_job(job_info: dict) -> ProcessorJob:
    job = ProcessorJob()
    job.pipeline_applied = "NO_OP"
    job.save()

    og_file = OriginalFile()
    og_file.source_filename = job_info["source_filename"]
    og_file.filename = job_info["filename"]
    og_file.absolute_file_path = job_info["absolute_file_path"]
    og_file.is_downloaded = True
    og_file.save()

    sample = Sample()
    sample.accession_code = job_info["accession_code"]
    sample.title = job_info["accession_code"]
    sample.platform_accession_code = job_info["platform_accession_code"]

    manufacturer = job_info.get("manufacturer", None)
    if manufacturer is not None:
        sample.manufacturer = manufacturer

    # The illumina samples need the human organism
    if manufacturer == "ILLUMINA":
        homo_sapiens = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
        homo_sapiens.save()
        sample.organism = homo_sapiens

    sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.original_file = og_file
    assoc.sample = sample
    assoc.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = job
    assoc1.save()

    return job
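
A sketch of the job_info dict this helper expects; every value below is a hypothetical placeholder:

job = prepare_job({
    "source_filename": "https://example.org/GSM0000000_sample_table.txt",
    "filename": "GSM0000000_sample_table.txt",
    "absolute_file_path": "/tmp/GSM0000000_sample_table.txt",
    "accession_code": "GSM0000000",
    "platform_accession_code": "A-MEXP-1171",
    "manufacturer": "ILLUMINA",  # optional; triggers the HOMO_SAPIENS organism setup
})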
Example #18
def prepare_huex_v1_job():
    pj = ProcessorJob()
    pj.pipeline_applied = "AFFY_TO_PCL"
    pj.save()

    og_file = OriginalFile()
    og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1364nnn/GSM1364667/suppl/GSM1364667_U_110208_7-02-10_S2.CEL.gz"
    og_file.filename = "GSM1364667_U_110208_7-02-10_S2.CEL"
    og_file.absolute_file_path = (
        "/home/user/data_store/raw/TEST/CEL/GSM1364667_U_110208_7-02-10_S2.CEL"
    )
    og_file.is_downloaded = True
    og_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    return pj
Example #19
    def test_no_job_created_when_failed_job_exists(self):
        experiment = setup_experiment([], ["GSM001"])

        # create a failed job for that experiment
        processor_job = ProcessorJob()
        processor_job.pipeline_applied = ProcessorPipeline.SALMON.value
        processor_job.ram_amount = 1024
        processor_job.success = False
        processor_job.retried = False
        processor_job.no_retry = False
        processor_job.save()

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = experiment.samples.first().original_files.first()
        assoc.processor_job = processor_job
        assoc.save()

        # Run command
        update_salmon_all_experiments()

        dl_jobs = DownloaderJob.objects.all()
        self.assertEqual(dl_jobs.count(), 0)
Example #20
def run_tximport_if_eligible(experiment: Experiment,
                             dispatch_jobs=True) -> Optional[ProcessorJob]:
    """Checks if an experiment is eligible to have tximport run on it and creates a job for it.

    If the dispatch_jobs parameter is True a Batch job will be dispatched for it.

    Returns the ProcessorJob if a job was created or None if one was not.
    """
    tximport_pipeline = ProcessorPipeline.TXIMPORT

    if get_tximport_inputs_if_eligible(experiment, True):
        processor_job = ProcessorJob()
        processor_job.pipeline_applied = tximport_pipeline.value
        processor_job.ram_amount = 32768
        processor_job.save()

        assoc = ProcessorJobOriginalFileAssociation()
        # Any original file linked to any sample of the
        # experiment will work. Tximport is somewhat special
        # in that it doesn't actually use original files so
        # this is just used to point to the experiment.
        assoc.original_file = experiment.samples.all()[0].original_files.all(
        )[0]
        assoc.processor_job = processor_job
        assoc.save()

        if dispatch_jobs:
            try:
                send_job(tximport_pipeline, processor_job)
            except Exception:
                # If we cannot queue the job now the Foreman will do
                # it later.
                pass

        return processor_job

    return None
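
Callers can use the return value to tell whether a job was created; a minimal sketch, assuming the experiment already exists (the accession is reused from setup_experiment below):

experiment = Experiment.objects.get(accession_code="SRP095529")
job = run_tximport_if_eligible(experiment, dispatch_jobs=False)
if job is None:
    print("Experiment is not yet eligible for tximport.")
else:
    print("Created ProcessorJob %d without dispatching it." % job.id)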
Example #21
    def setUp(self):
        # Saving this in case we ever have protected endpoints
        # self.superuser = User.objects.create_superuser('john', '*****@*****.**', 'johnpassword')
        # self.client.login(username='******', password='******')
        # self.user = User.objects.create(username="******")

        experiment = Experiment()
        experiment.accession_code = "GSE000"
        experiment.alternate_accession_code = "E-GEOD-000"
        experiment.title = "NONONONO"
        experiment.description = "Boooooourns. Wasabi."
        experiment.technology = "RNA-SEQ"
        experiment.save()

        experiment = Experiment()
        experiment.accession_code = "GSE123"
        experiment.title = "Hey Ho Let's Go"
        experiment.description = (
            "This is a very exciting test experiment. Faygo soda. Blah blah blah."
        )
        experiment.technology = "MICROARRAY"
        experiment.save()
        self.experiment = experiment

        experiment_annotation = ExperimentAnnotation()
        experiment_annotation.data = {"hello": "world", "123": 456}
        experiment_annotation.experiment = experiment
        experiment_annotation.save()

        # Create 26 test organisms numbered 0-25 for the pagination test; with the 3 created below there are 29 organisms total.
        for i in range(26):
            Organism(name=("TEST_ORGANISM_{}".format(i)),
                     taxonomy_id=(1234 + i)).save()

        ailuropoda = Organism(name="AILUROPODA_MELANOLEUCA",
                              taxonomy_id=9646,
                              is_scientific_name=True)
        ailuropoda.save()
        self.homo_sapiens = Organism(name="HOMO_SAPIENS",
                                     taxonomy_id=9606,
                                     is_scientific_name=True)
        self.homo_sapiens.save()
        self.danio_rerio = Organism(name="DANIO_RERIO",
                                    taxonomy_id=1337,
                                    is_scientific_name=True)
        self.danio_rerio.save()

        sample = Sample()
        sample.title = "123"
        sample.accession_code = "123"
        sample.is_processed = True
        sample.organism = ailuropoda
        sample.save()

        sample = Sample()
        sample.title = "789"
        sample.accession_code = "789"
        sample.is_processed = True
        sample.organism = ailuropoda
        sample.save()
        self.sample = sample

        # add qn target for sample organism
        result = ComputationalResult()
        result.commands.append("create_qn_target.py")
        result.is_ccdl = True
        result.is_public = True
        result.processor = None
        result.save()

        cra = ComputationalResultAnnotation()
        cra.result = result
        cra.data = {"organism_id": ailuropoda.id, "is_qn": True}
        cra.save()

        ailuropoda.qn_target = result
        ailuropoda.save()

        sample_annotation = SampleAnnotation()
        sample_annotation.data = {"goodbye": "world", "789": 123}
        sample_annotation.sample = sample
        sample_annotation.save()

        original_file = OriginalFile()
        original_file.save()

        original_file_sample_association = OriginalFileSampleAssociation()
        original_file_sample_association.sample = sample
        original_file_sample_association.original_file = original_file
        original_file_sample_association.save()

        downloader_job = DownloaderJob()
        downloader_job.save()

        download_assoc = DownloaderJobOriginalFileAssociation()
        download_assoc.original_file = original_file
        download_assoc.downloader_job = downloader_job
        download_assoc.save()

        processor_job = ProcessorJob()
        processor_job.save()

        processor_assoc = ProcessorJobOriginalFileAssociation()
        processor_assoc.original_file = original_file
        processor_assoc.processor_job = processor_job
        processor_assoc.save()

        experiment_sample_association = ExperimentSampleAssociation()
        experiment_sample_association.sample = sample
        experiment_sample_association.experiment = experiment
        experiment_sample_association.save()
        experiment.num_total_samples = 1
        experiment.num_processed_samples = 1
        experiment.save()

        result = ComputationalResult()
        result.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        result = ComputationalResult()
        result.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        processor = Processor()
        processor.name = "Salmon Quant"
        processor.version = "v9.9.9"
        processor.docker_image = "dr_salmon"
        processor.environment = '{"some": "environment"}'
        processor.save()

        computational_result_short = ComputationalResult(processor=processor)
        computational_result_short.save()

        organism_index = OrganismIndex()
        organism_index.index_type = "TRANSCRIPTOME_SHORT"
        organism_index.organism = self.danio_rerio
        organism_index.result = computational_result_short
        organism_index.absolute_directory_path = (
            "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT")
        organism_index.is_public = True
        organism_index.s3_url = "not_blank"
        organism_index.save()

        return
Example #22
    def test_organism_shepherd_command(self, mock_nomad, mock_send_job,
                                       mock_get_active_volumes):
        """Tests that the organism shepherd requeues jobs in the right order.

        The situation we're setting up is basically this:
          * There are two experiments.
          * One of them has 1/2 samples processed, the other 0/1
          * One of them needs a DownloaderJob requeued and the other
            needs a ProcessorJob requeued.

        And what we're going to test for is:
          * Both of the jobs that need to be requeued are requeued.
          * The experiment with a processed sample is requeued first
            because it has a higher completion percentage.
        """
        # First, set up our mocks to prevent network calls.
        mock_send_job.return_value = True
        active_volumes = {"1", "2", "3"}
        mock_get_active_volumes.return_value = active_volumes

        def mock_init_nomad(host, port=0, timeout=0):
            ret_value = MagicMock()
            ret_value.jobs = MagicMock()
            ret_value.jobs.get_jobs = MagicMock()
            ret_value.jobs.get_jobs.side_effect = lambda: []
            return ret_value

        mock_nomad.side_effect = mock_init_nomad
        zebrafish = Organism(name="DANIO_RERIO",
                             taxonomy_id=1337,
                             is_scientific_name=True)
        zebrafish.save()

        # Experiment that is 0% complete.
        zero_percent_experiment = Experiment(accession_code='ERP037000')
        zero_percent_experiment.technology = 'RNA-SEQ'
        zero_percent_experiment.save()

        organism_assoc = ExperimentOrganismAssociation.objects.create(
            organism=zebrafish, experiment=zero_percent_experiment)

        zero_percent = OriginalFile()
        zero_percent.filename = "ERR037001.fastq.gz"
        zero_percent.source_filename = "ERR037001.fastq.gz"
        zero_percent.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR037/ERR037001/ERR037001_1.fastq.gz"
        zero_percent.is_archive = True
        zero_percent.save()

        zero_percent_sample = Sample()
        zero_percent_sample.accession_code = 'ERR037001'
        zero_percent_sample.organism = zebrafish
        zero_percent_sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.sample = zero_percent_sample
        assoc.original_file = zero_percent
        assoc.save()

        assoc = ExperimentSampleAssociation()
        assoc.sample = zero_percent_sample
        assoc.experiment = zero_percent_experiment
        assoc.save()

        # TODO: fix names of all the variables to be appropriate for this test case.
        zero_percent_dl_job = DownloaderJob()
        zero_percent_dl_job.accession_code = zero_percent_sample.accession_code
        zero_percent_dl_job.downloader_task = "SRA"
        zero_percent_dl_job.start_time = timezone.now()
        zero_percent_dl_job.end_time = timezone.now()
        zero_percent_dl_job.success = False
        zero_percent_dl_job.save()

        assoc = DownloaderJobOriginalFileAssociation()
        assoc.downloader_job = zero_percent_dl_job
        assoc.original_file = zero_percent
        assoc.save()

        # Experiment that is 50% complete.
        fifty_percent_experiment = Experiment(accession_code='ERP036000')
        fifty_percent_experiment.technology = 'RNA-SEQ'
        fifty_percent_experiment.save()

        organism_assoc = ExperimentOrganismAssociation.objects.create(
            organism=zebrafish, experiment=fifty_percent_experiment)

        ## First sample, this one has been processed.
        successful_pj = ProcessorJob()
        successful_pj.accession_code = "ERR036000"
        successful_pj.pipeline_applied = "SALMON"
        successful_pj.ram_amount = 12288
        successful_pj.start_time = timezone.now()
        successful_pj.end_time = timezone.now()
        successful_pj.success = True
        successful_pj.save()

        successful_og = OriginalFile()
        successful_og.filename = "ERR036000.fastq.gz"
        successful_og.source_filename = "ERR036000.fastq.gz"
        successful_og.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz"
        successful_og.is_archive = True
        successful_og.save()

        successful_sample = Sample()
        successful_sample.accession_code = 'ERR036000'
        successful_sample.organism = zebrafish
        successful_sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.sample = successful_sample
        assoc.original_file = successful_og
        assoc.save()

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.processor_job = successful_pj
        assoc.original_file = successful_og
        assoc.save()

        assoc = ExperimentSampleAssociation()
        assoc.sample = successful_sample
        assoc.experiment = fifty_percent_experiment
        assoc.save()

        ## Second sample, this one hasn't been processed.
        fifty_percent_unprocessed_og = OriginalFile()
        fifty_percent_unprocessed_og.filename = "ERR036001.fastq.gz"
        fifty_percent_unprocessed_og.source_filename = "ERR036001.fastq.gz"
        fifty_percent_unprocessed_og.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036001/ERR036001_1.fastq.gz"
        fifty_percent_unprocessed_og.is_archive = True
        fifty_percent_unprocessed_og.save()

        fifty_percent_unprocessed_sample = Sample()
        fifty_percent_unprocessed_sample.accession_code = 'ERR036001'
        fifty_percent_unprocessed_sample.organism = zebrafish
        fifty_percent_unprocessed_sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.sample = fifty_percent_unprocessed_sample
        assoc.original_file = fifty_percent_unprocessed_og
        assoc.save()

        assoc = ExperimentSampleAssociation()
        assoc.sample = fifty_percent_unprocessed_sample
        assoc.experiment = fifty_percent_experiment
        assoc.save()

        fifty_percent_processor_job = ProcessorJob()
        fifty_percent_processor_job.pipeline_applied = "SALMON"
        fifty_percent_processor_job.accession_code = fifty_percent_unprocessed_sample.accession_code
        fifty_percent_processor_job.ram_amount = 12288
        fifty_percent_processor_job.start_time = timezone.now()
        fifty_percent_processor_job.end_time = timezone.now()
        fifty_percent_processor_job.success = False
        fifty_percent_processor_job.save()

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.processor_job = fifty_percent_processor_job
        assoc.original_file = fifty_percent_unprocessed_og
        assoc.save()

        # Setup is done, actually run the command.
        args = []
        options = {"organism_name": "DANIO_RERIO"}
        call_command("organism_shepherd", *args, **options)

        # Verify that the jobs were called in the correct order.
        mock_calls = mock_send_job.mock_calls

        first_call_job_type = mock_calls[0][1][0]
        first_call_job_object = mock_calls[0][2]["job"]
        self.assertEqual(first_call_job_type, ProcessorPipeline.SALMON)
        self.assertEqual(first_call_job_object.pipeline_applied,
                         fifty_percent_processor_job.pipeline_applied)
        self.assertEqual(first_call_job_object.ram_amount,
                         fifty_percent_processor_job.ram_amount)
        self.assertIn(first_call_job_object.volume_index, active_volumes)

        fifty_percent_processor_job.refresh_from_db()
        self.assertEqual(first_call_job_object,
                         fifty_percent_processor_job.retried_job)

        second_call_job_type = mock_calls[1][1][0]
        second_call_job_object = mock_calls[1][2]["job"]
        self.assertEqual(second_call_job_type, Downloaders.SRA)
        self.assertEqual(second_call_job_object.accession_code,
                         zero_percent_dl_job.accession_code)
        self.assertEqual(second_call_job_object.downloader_task,
                         zero_percent_dl_job.downloader_task)

        zero_percent_dl_job.refresh_from_db()
        self.assertEqual(second_call_job_object,
                         zero_percent_dl_job.retried_job)
Example #23
def prepare_job(length):
    pj = ProcessorJob()
    pj.pipeline_applied = "TRANSCRIPTOME_INDEX_" + length.upper()
    pj.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    samp = Sample()
    samp.organism = homo_sapiens
    samp.accession_code = "derp" + length
    samp.save()

    og_file = OriginalFile()
    og_file.source_filename = "aegilops_tauschii_" + length + ".fa.gz"
    og_file.filename = "aegilops_tauschii_" + length + ".fa.gz"
    og_file.absolute_file_path = "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/AEGILOPS_TAUSCHII/aegilops_tauschii_short.fa.gz"
    og_file.is_downloaded = True
    og_file.save()

    og_file2 = OriginalFile()
    og_file2.source_filename = "aegilops_tauschii_" + length + ".gtf.gz"
    og_file2.filename = "aegilops_tauschii_" + length + ".gtf.gz"
    og_file2.absolute_file_path = "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/AEGILOPS_TAUSCHII/aegilops_tauschii_short.gtf.gz"
    og_file2.is_downloaded = True
    og_file2.save()

    og_file_samp_assoc = OriginalFileSampleAssociation()
    og_file_samp_assoc.original_file = og_file
    og_file_samp_assoc.sample = samp
    og_file_samp_assoc.save()

    og_file_samp_assoc2 = OriginalFileSampleAssociation()
    og_file_samp_assoc2.original_file = og_file2
    og_file_samp_assoc2.sample = samp
    og_file_samp_assoc2.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    assoc2 = ProcessorJobOriginalFileAssociation()
    assoc2.original_file = og_file2
    assoc2.processor_job = pj
    assoc2.save()

    return pj
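
This fixture is parameterized on index length; a usage sketch feeding it to the transcriptome-index worker, where the module path and function signature are assumptions modeled on the no_op and illumina calls above:

from data_refinery_workers.processors import transcriptome_index  # assumed module

pj = prepare_job("short")  # or "long"
final_context = transcriptome_index.build_transcriptome_index(pj.pk, length="short")  # assumed signature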
Example #24
def _detect_platform(job_context: Dict) -> Dict:
    """
    Determine the platform/database to process this sample with.
    They often provide something like "V2" or "V 2", but we don't trust them so we detect it ourselves.

    Related: https://github.com/AlexsLemonade/refinebio/issues/232
    """

    all_databases = {
        'HOMO_SAPIENS': [
            'illuminaHumanv1',
            'illuminaHumanv2',
            'illuminaHumanv3',
            'illuminaHumanv4',
        ],
        'MUS_MUSCULUS': [
            'illuminaMousev1',
            'illuminaMousev1p1',
            'illuminaMousev2',
        ],
        'RATTUS_NORVEGICUS': ['illuminaRatv1']
    }

    sample0 = job_context['samples'][0]
    databases = all_databases[sample0.organism.name]

    # Loop over all of the possible platforms and find the one with the best match.
    highest = 0.0
    high_mapped_percent = 0.0
    high_db = None
    for platform in databases:
        try:
            result = subprocess.check_output([
                "/usr/bin/Rscript",
                "--vanilla",
                "/home/user/data_refinery_workers/processors/detect_database.R",
                "--platform",
                platform,
                "--inputFile",
                job_context['input_file_path'],
                "--column",
                job_context['probeId'],
            ])

            results = result.decode().split('\n')
            cleaned_result = float(results[0].strip())

            if cleaned_result > highest:
                highest = cleaned_result
                high_db = platform
                high_mapped_percent = float(results[1].strip())

        except Exception as e:
            logger.exception(e, processor_job_id=job_context["job"].id)
            continue

    # Record our sample detection outputs for every sample.
    for sample in job_context['samples']:
        sa = SampleAnnotation()
        sa.sample = sample
        sa.is_ccdl = True
        sa.data = {
            "detected_platform": high_db,
            "detection_percentage": highest,
            "mapped_percentage": high_mapped_percent
        }
        sa.save()

    # If the match is over 75%, record this and process it on that platform.
    if high_mapped_percent > 75.0:
        job_context['platform'] = high_db
    # The match percentage is too low - send this to the no-opper instead.
    else:
        logger.info("Match percentage too low, NO_OP'ing and aborting.",
                    job=job_context['job_id'])

        processor_job = ProcessorJob()
        processor_job.pipeline_applied = "NO_OP"
        processor_job.volume_index = job_context["job"].volume_index
        processor_job.ram_amount = job_context["job"].ram_amount
        processor_job.save()

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = job_context["original_files"][0]
        assoc.processor_job = processor_job
        assoc.save()

        try:
            send_job(ProcessorPipeline.NO_OP, processor_job)
        except Exception as e:
            # Nomad dispatch error, likely during local test.
            logger.error(e, job=processor_job)

        job_context['abort'] = True

    return job_context
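
The Rscript invoked above is expected to print two lines, a match percentage and a mapped percentage; a hedged sketch of that parse step in isolation:

raw_output = b"83.25\n79.50\n"  # hypothetical bytes from subprocess.check_output
lines = raw_output.decode().split('\n')
match_percentage = float(lines[0].strip())   # compared against `highest`
mapped_percentage = float(lines[1].strip())  # stored as high_mapped_percent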
Example #25
def setup_experiment() -> Experiment:
    """Create an experiment with two samples, one of which has a processor job
    that failed because Batch restarted it."""

    # Create the experiment
    experiment_accession = "SRP095529"
    experiment = Experiment.objects.create(accession_code=experiment_accession,
                                           technology="RNA-SEQ")

    zebrafish = Organism.get_object_for_name("DANIO_RERIO")

    accession_code = "S001"
    sample = Sample.objects.create(
        accession_code=accession_code,
        organism=zebrafish,
        source_database="SRA",
        technology="RNA-SEQ",
        platform_accession_code="IlluminaHiSeq1000",
    )
    ExperimentSampleAssociation.objects.create(experiment=experiment,
                                               sample=sample)

    original_file = OriginalFile()
    original_file.filename = accession_code + ".SRA"
    original_file.source_filename = accession_code + ".SRA"
    original_file.save()

    OriginalFileSampleAssociation.objects.get_or_create(
        original_file=original_file, sample=sample)

    accession_code = "S002"
    sample = Sample.objects.create(
        accession_code=accession_code,
        organism=zebrafish,
        source_database="SRA",
        technology="RNA-SEQ",
        platform_accession_code="IlluminaHiSeq1000",
    )
    ExperimentSampleAssociation.objects.create(experiment=experiment,
                                               sample=sample)

    original_file = OriginalFile()
    original_file.filename = accession_code + ".SRA"
    original_file.source_filename = accession_code + ".SRA"
    original_file.save()

    OriginalFileSampleAssociation.objects.get_or_create(
        original_file=original_file, sample=sample)

    # add a failed processor job for the second sample
    processor_job = ProcessorJob()
    processor_job.start_time = timezone.now()
    processor_job.end_time = timezone.now()
    processor_job.no_retry = True
    processor_job.success = False
    processor_job.failure_reason = (
        "ProcessorJob has already completed with a fail - why are we here again?"
    )
    processor_job.save()

    processor_assoc = ProcessorJobOriginalFileAssociation()
    processor_assoc.original_file = original_file
    processor_assoc.processor_job = processor_job
    processor_assoc.save()

    return experiment
Example #26
def create_processor_jobs_for_original_files(
        original_files: List[OriginalFile],
        downloader_job: DownloaderJob = None):
    """
    Create processor jobs and queue processor tasks for samples related to an experiment.
    """
    for original_file in original_files:
        sample_object = original_file.samples.first()

        if not delete_if_blacklisted(original_file):
            continue

        # Fix for: https://github.com/AlexsLemonade/refinebio/issues/968
        # Basically, we incorrectly detected technology/manufacturers
        # for many Affymetrix samples and this is a good place to fix
        # some of them.
        if original_file.is_affy_data():
            # Only Affymetrix Microarrays produce .CEL files
            sample_object.technology = 'MICROARRAY'
            sample_object.manufacturer = 'AFFYMETRIX'
            sample_object.save()

        pipeline_to_apply = determine_processor_pipeline(
            sample_object, original_file)

        if pipeline_to_apply == ProcessorPipeline.NONE:
            logger.info(
                "No valid processor pipeline found to apply to sample.",
                sample=sample_object.id,
                original_file=original_files[0].id)
            original_file.delete_local_file()
            original_file.is_downloaded = False
            original_file.save()
        else:
            processor_job = ProcessorJob()
            processor_job.pipeline_applied = pipeline_to_apply.value
            processor_job.ram_amount = determine_ram_amount(
                sample_object, processor_job)
            processor_job.save()

            assoc = ProcessorJobOriginalFileAssociation()
            assoc.original_file = original_file
            assoc.processor_job = processor_job
            assoc.save()

            if downloader_job:
                logger.debug("Queuing processor job.",
                             processor_job=processor_job.id,
                             original_file=original_file.id,
                             downloader_job=downloader_job.id)
            else:
                logger.debug("Queuing processor job.",
                             processor_job=processor_job.id,
                             original_file=original_file.id)

            try:
                send_job(pipeline_to_apply, processor_job)
            except Exception:
                # If we cannot queue the job now the Foreman will do
                # it later.
                pass
Example #27
    def setUpClass(cls):
        """Set up class."""
        super(ESTestCases, cls).setUpClass()  # ref https://stackoverflow.com/a/29655301/763705

        experiment = Experiment()
        experiment.accession_code = "GSE000-X"
        experiment.title = "NONONONO"
        experiment.description = "Boooooourns. Wasabi."
        experiment.technology = "RNA-SEQ"
        experiment.save()

        experiment = Experiment()
        experiment.accession_code = "GSE123-X"
        experiment.title = "Hey Ho Let's Go"
        experiment.description = (
            "This is a very exciting test experiment. Faygo soda. Blah blah blah."
        )
        experiment.technology = "MICROARRAY"
        experiment.num_processed_samples = 1  # added below
        experiment.num_total_samples = 1
        experiment.num_downloadable_samples = 1
        experiment.save()

        experiment_annotation = ExperimentAnnotation()
        experiment_annotation.data = {"hello": "world", "123": 456}
        experiment_annotation.experiment = experiment
        experiment_annotation.save()

        sample = Sample()
        sample.title = "123"
        sample.accession_code = "123"
        sample.save()

        organism = Organism(
            name="AILUROPODA_MELANOLEUCA", taxonomy_id=9646, is_scientific_name=True
        )
        organism.save()

        sample = Sample()
        sample.title = "789"
        sample.accession_code = "789"
        sample.is_processed = True
        sample.organism = organism
        sample.save()

        sample_annotation = SampleAnnotation()
        sample_annotation.data = {"goodbye": "world", "789": 123}
        sample_annotation.sample = sample
        sample_annotation.save()

        original_file = OriginalFile()
        original_file.save()

        original_file_sample_association = OriginalFileSampleAssociation()
        original_file_sample_association.sample = sample
        original_file_sample_association.original_file = original_file
        original_file_sample_association.save()

        downloader_job = DownloaderJob()
        downloader_job.save()

        download_assoc = DownloaderJobOriginalFileAssociation()
        download_assoc.original_file = original_file
        download_assoc.downloader_job = downloader_job
        download_assoc.save()

        processor_job = ProcessorJob()
        processor_job.save()

        processor_assoc = ProcessorJobOriginalFileAssociation()
        processor_assoc.original_file = original_file
        processor_assoc.processor_job = processor_job
        processor_assoc.save()

        # associate the experiment with the sample
        experiment_sample_association = ExperimentSampleAssociation()
        experiment_sample_association.sample = sample
        experiment_sample_association.experiment = experiment
        experiment_sample_association.save()

        result = ComputationalResult()
        result.save()

        # and create a qn target for the sample
        computational_result = ComputationalResultAnnotation()
        computational_result.result = result
        computational_result.data = {"is_qn": True, "organism_id": sample.organism.id}
        computational_result.save()

        # and associate it with the sample organism
        sample.organism.qn_target = result
        sample.organism.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        result = ComputationalResult()
        result.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        # Clear the default cache and reindex;
        # otherwise the organisms with qn_targets will be cached.
        cache.clear()
        call_command("search_index", "--rebuild", "-f")
Example #28
def create_long_and_short_processor_jobs(downloader_job, long_files_to_process,
                                         short_files_to_process):
    """ Creates two processor jobs for the files needed for this transcriptome"""

    processor_job_long = ProcessorJob()
    processor_job_long.downloader_job = downloader_job
    processor_job_long.pipeline_applied = "TRANSCRIPTOME_INDEX_LONG"
    processor_job_long.ram_amount = 4096
    processor_job_long.save()

    for original_file in long_files_to_process:

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job_long
        assoc.save()

    try:
        send_job(ProcessorPipeline[processor_job_long.pipeline_applied],
                 processor_job_long)
    except Exception:
        # This is fine, the foreman will requeue these later.
        logger.exception(
            "Problem with submitting a long transcriptome index job.")

    processor_job_short = ProcessorJob()
    processor_job_short.downloader_job = downloader_job
    processor_job_short.pipeline_applied = "TRANSCRIPTOME_INDEX_SHORT"
    processor_job_short.ram_amount = 4096
    processor_job_short.save()

    for original_file in short_files_to_process:

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job_short
        assoc.save()

    try:
        send_job(ProcessorPipeline[processor_job_short.pipeline_applied],
                 processor_job_short)
    except Exception:
        # This is fine, the foreman will requeue these later.
        logger.exception(
            "Problem with submitting a short transcriptome index job.")
Example #29
    def test_convert_simple_pcl(self):
        """Run the NO_OP processor on two simple PCL sample tables, with and without a header row."""

        job = ProcessorJob()
        job.pipeline_applied = "NO_OP"
        job.save()

        # ID_REF, VALUE
        og_file = OriginalFile()
        og_file.source_filename = "https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-51013/"
        og_file.filename = "GSM1234847_sample_table.txt"
        og_file.absolute_file_path = "/home/user/data_store/raw/TEST/NO_OP/GSM1234847_sample_table.txt"
        og_file.is_downloaded = True
        og_file.save()

        sample = Sample()
        sample.accession_code = "GSM1234847"
        sample.title = "GSM1234847"
        sample.platform_accession_code = 'A-AFFY-38'
        sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.original_file = og_file
        assoc.sample = sample
        assoc.save()

        assoc1 = ProcessorJobOriginalFileAssociation()
        assoc1.original_file = og_file
        assoc1.processor_job = job
        assoc1.save()

        final_context = no_op.no_op_processor(job.pk)

        # No header - ex
        # AFFX-BioB-3_at  0.74218756
        og_file = OriginalFile()
        og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE10nnn/GSE10188/miniml/GSE10188_family.xml.tgz"
        og_file.filename = "GSM269747-tbl-1.txt"
        og_file.absolute_file_path = "/home/user/data_store/raw/TEST/NO_OP/GSM269747-tbl-1.txt"
        og_file.is_downloaded = True
        og_file.save()

        sample = Sample()
        sample.accession_code = "GSM269747"
        sample.title = "GSM269747"
        sample.platform_accession_code = 'GPL1319'
        sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.original_file = og_file
        assoc.sample = sample
        assoc.save()

        job = ProcessorJob()
        job.pipeline_applied = "NO_OP"
        job.save()

        assoc1 = ProcessorJobOriginalFileAssociation()
        assoc1.original_file = og_file
        assoc1.processor_job = job
        assoc1.save()

        final_context = no_op.no_op_processor(job.pk)
        self.assertTrue(final_context['success'])
        self.assertTrue(os.path.exists(final_context['output_file_path']))
        self.assertEqual(os.path.getsize(final_context['output_file_path']),
                         346535)
Example #30
def create_long_and_short_processor_jobs(files_to_process):
    """ Creates two processor jobs for the files needed for this transcriptome"""

    processor_job_long = ProcessorJob()
    processor_job_long.pipeline_applied = "TRANSCRIPTOME_INDEX_LONG"
    processor_job_long.ram_amount = 4096
    processor_job_long.save()

    for original_file in files_to_process:

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job_long
        assoc.save()

    try:
        send_job(ProcessorPipeline[processor_job_long.pipeline_applied],
                 processor_job_long)
    except Exception:
        # This is fine, the foreman will requeue these later.
        pass

    processor_job_short = ProcessorJob()
    processor_job_short.pipeline_applied = "TRANSCRIPTOME_INDEX_SHORT"
    processor_job_short.ram_amount = 4096
    processor_job_short.save()

    for original_file in files_to_process:

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job_short
        assoc.save()

    try:
        send_job(ProcessorPipeline[processor_job_short.pipeline_applied],
                 processor_job_short)
    except Exception:
        # This is fine, the foreman will requeue these later.
        pass