def test_run_pipeline__samples_not_ready__fastq2(self):
        """Tests that the pipeline raises an AssertionError if samples aren't
        ready, fastq2.
        """
        fastq_dataset = self.experiment_sample.dataset_set.filter(
            type=Dataset.TYPE.FASTQ2)[0]
        fastq_dataset.status = Dataset.STATUS.QUEUED_TO_COPY
        fastq_dataset.save()

        sample_list = [self.experiment_sample]

        with self.assertRaises(AssertionError):
            run_pipeline('name_placeholder', self.reference_genome, sample_list)
Example #2
 def test_run_pipeline__bad_alignment(self):
     """Alignment of bad reads. Might happen if user tries to align wrong
     reads to wrong reference genome.
     """
     ref_genome = import_reference_genome_from_local_file(
         self.project, 'concat_mg1655_partials',
         FullVCFTestSet.TEST_CONCAT_GENBANK, 'genbank')
     sample_list = [self.experiment_sample]
     # NOTE: Ideally there would be a better way to test this.
     # In general, we need to figure out how to better communicate the reason
     # for a failed alignment to the user.
     with self.assertRaises(Exception):
         run_pipeline('name_placeholder', ref_genome, sample_list)
Example #3
 def test_run_pipeline__bad_alignment(self):
     """Alignment of bad reads. Might happen if user tries to align wrong
     reads to wrong reference genome.
     """
     ref_genome = import_reference_genome_from_local_file(
             self.project, 'concat_mg1655_partials',
             FullVCFTestSet.TEST_CONCAT_GENBANK, 'genbank')
     sample_list = [self.experiment_sample]
     # NOTE: Ideally there would be a better way to test this.
     # In general, we need to figure out how to better communicate the reason
     # for a failed alignment to the user.
     with self.assertRaises(Exception):
         run_pipeline('name_placeholder', ref_genome, sample_list)
Example #4
    def test_run_pipeline__samples_not_ready__fastq2(self):
        """Tests that the pipeline raises an AssertionError if samples aren't
        ready, fastq2.
        """
        fastq_dataset = self.experiment_sample.dataset_set.filter(
            type=Dataset.TYPE.FASTQ2)[0]
        fastq_dataset.status = Dataset.STATUS.QUEUED_TO_COPY
        fastq_dataset.save()

        sample_list = [self.experiment_sample]

        with self.assertRaises(AssertionError):
            run_pipeline('name_placeholder', self.reference_genome, sample_list)
Example #5
    def test_run_pipeline__genbank_from_ncbi_with_spaces_in_label(self):
        """Tests the pipeline where the genome is imported from NCBI with
        spaces in the name.
        """
        if not internet_on():
            return
        MG1655_ACCESSION = 'NC_000913.3'
        MG1655_LABEL = 'mg1655 look a space'
        ref_genome = import_reference_genome_from_ncbi(self.project,
                                                       MG1655_LABEL,
                                                       MG1655_ACCESSION,
                                                       'genbank')
        sample_list = [self.experiment_sample]

        alignment_group_obj, async_result = run_pipeline(
            'name_placeholder', ref_genome, sample_list)

        # Block until pipeline finishes.
        while not async_result.ready():
            time.sleep(1)
        if async_result.status == 'FAILURE':
            self.fail('Async task failed.')

        alignment_group_obj = AlignmentGroup.objects.get(
            id=alignment_group_obj.id)

        self.assertEqual(
            1, len(alignment_group_obj.experimentsampletoalignment_set.all()))
        self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
                         alignment_group_obj.status)
Example #6
    def test_run_pipeline(self):
        """Tests running the full pipeline.
        """
        sample_list = [self.experiment_sample]

        alignment_group_obj, async_result = run_pipeline(
            'name_placeholder', self.reference_genome, sample_list)

        # Block until pipeline finishes.
        while not async_result.ready():
            time.sleep(1)
        if async_result.status == 'FAILURE':
            self.fail('Async task failed.')

        # Refresh the object.
        alignment_group_obj = AlignmentGroup.objects.get(
            id=alignment_group_obj.id)

        # Verify the AlignmentGroup object is created.
        self.assertEqual(
            1, len(alignment_group_obj.experimentsampletoalignment_set.all()))
        self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
                         alignment_group_obj.status)

        # Make sure the initial JBrowse config has been created.
        jbrowse_dir = self.reference_genome.get_jbrowse_directory_path()
        self.assertTrue(os.path.exists(jbrowse_dir))
        self.assertTrue(
            os.path.exists(os.path.join(jbrowse_dir, 'indiv_tracks')))
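Several of the examples here (e.g. #5 and #6 above, and #22 below) block on the pipeline by polling ready() and then inspecting status. A minimal sketch of how that wait could be factored into one reusable helper with a timeout is shown below; the helper name and the 600-second default are assumptions, not part of the Millstone code, and it only assumes a Celery-style AsyncResult exposing ready() and status.

import time

def wait_for_pipeline(async_result, timeout_s=600, poll_s=1):
    """Block until the pipeline task finishes or timeout_s elapses.

    Returns True unless the task reports FAILURE; raises on timeout.
    """
    deadline = time.time() + timeout_s
    while not async_result.ready():
        if time.time() > deadline:
            raise RuntimeError('Timed out waiting for pipeline task.')
        time.sleep(poll_s)
    return async_result.status != 'FAILURE'

A test could then call self.fail('Async task failed.') when the helper returns False, mirroring the checks in the examples above.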
Example #7
    def test_run_pipeline(self):
        """End-to-end test of pipeline. Fails if any errors.
        """
        # Create an extra sample that will not be aligned but has parent-child
        # relationship with the sample that is aligned. This would catch the
        # bug reported in https://github.com/churchlab/millstone/issues/561.
        unused_es = ExperimentSample.objects.create(project=self.project,
                                                    label='unused sample')
        self.experiment_sample.add_child(unused_es)

        sample_list = [self.experiment_sample]
        result = run_pipeline('name_placeholder', self.reference_genome,
                              sample_list)
        alignment_group = result[0]
        alignment_async_result = result[1]
        variant_calling_async_result = result[2]
        alignment_async_result.get()
        variant_calling_async_result.get()
        alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
        self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
                         alignment_group.status)

        # Make sure some expected variants are found.
        variants = Variant.objects.filter(
            reference_genome=self.reference_genome)
        self.assertTrue(len(variants))
        v_1834 = Variant.objects.get(position=1834)
        v_1834_vccd = v_1834.variantcallercommondata_set.all()[0]
        v_1834_ve = v_1834_vccd.variantevidence_set.all()[0]
        self.assertFalse(v_1834_ve.data.get('IS_SV', False))
Example #8
    def test_run_pipeline__snps_with_effect__no_svs(self):
        """Tests pipeline with SNPs with effect, but no SVs called.
        """
        ref_genome = import_reference_genome_from_local_file(
            self.project, 'mg1655_tolC_through_zupT',
            FullVCFTestSet.TEST_GENBANK, 'genbank')

        sample_obj = ExperimentSample.objects.create(project=self.project,
                                                     label='Sample %d' % 0)

        # Add raw reads to each sample.
        copy_and_add_dataset_source(sample_obj, Dataset.TYPE.FASTQ1,
                                    Dataset.TYPE.FASTQ1,
                                    FullVCFTestSet.FASTQ1[0])
        copy_and_add_dataset_source(sample_obj, Dataset.TYPE.FASTQ2,
                                    Dataset.TYPE.FASTQ2,
                                    FullVCFTestSet.FASTQ2[0])

        result = run_pipeline('test_align', ref_genome, [sample_obj])

        alignment_group = result[0]
        alignment_async_result = result[1]
        variant_calling_async_result = result[2]
        alignment_async_result.get()
        variant_calling_async_result.get()
        alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
        self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
                         alignment_group.status)

        # Check that SnpEff worked.
        v_205 = Variant.objects.get(
            reference_genome=alignment_group.reference_genome, position=205)
        v_205_va = v_205.variantalternate_set.all()[0]
        self.assertEqual('tolC', v_205_va.data['INFO_EFF_GENE'])
Example #9
    def test_run_pipeline(self):
        """End-to-end test of pipeline. Fails if any errors.
        """
        # Create an extra sample that will not be aligned but has parent-child
        # relationship with the sample that is aligned. This would catch the
        # bug reported in https://github.com/churchlab/millstone/issues/561.
        unused_es = ExperimentSample.objects.create(
                project=self.project, label='unused sample')
        self.experiment_sample.add_child(unused_es)

        sample_list = [self.experiment_sample]
        result = run_pipeline(
                'name_placeholder', self.reference_genome, sample_list)
        alignment_group = result[0]
        alignment_async_result = result[1]
        variant_calling_async_result = result[2]
        alignment_async_result.get()
        variant_calling_async_result.get()
        alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
        self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
                alignment_group.status)

        # Make sure some expected variants are found.
        variants = Variant.objects.filter(
                reference_genome=self.reference_genome)
        self.assertTrue(len(variants))
        v_1834 = Variant.objects.get(position=1834)
        v_1834_vccd = v_1834.variantcallercommondata_set.all()[0]
        v_1834_ve = v_1834_vccd.variantevidence_set.all()[0]
        self.assertFalse(v_1834_ve.data.get('IS_SV', False))
Example #10
 def test_run_pipeline(self):
     """End-to-end test of pipeline. Fails if any errors.
     """
     sample_list = [self.experiment_sample]
     alignment_group, async_result = run_pipeline('name_placeholder',
             self.reference_genome, sample_list)
     async_result.wait()
     alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
     self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
             alignment_group.status)
Example #11
    def test_run_alignment_with_spaces_in_genbank_filename(self):
        project = self.common_entities['project']
        ref_genome_label = 'dirty_upload'
        request = HttpRequest()
        request.POST = {
            'projectUid': project.uid,
            'refGenomeLabel': ref_genome_label,
            'importFileFormat': 'genbank'
        }
        request.method = 'POST'
        request.user = self.common_entities['user']
        authenticate(username=TEST_USERNAME, password=TEST_PASSWORD)
        self.assertTrue(request.user.is_authenticated())

        request.FILES['refGenomeFile'] = UploadedFile(
            file=open(TEST_GENBANK), name='dirty_genbank (spaces).gb')

        response = create_ref_genome_from_browser_upload(request)
        self.assertEqual(STATUS_CODE__SUCCESS, response.status_code)
        self.assertFalse(json.loads(response.content).get('error', False))

        # Get reference genome
        ref_genome = ReferenceGenome.objects.get(project=project,
                                                 label=ref_genome_label)

        # Create sample model
        sample = ExperimentSample.objects.create(project=project,
                                                 label='test_sample')

        # Add fastq datasets to sample
        add_dataset_to_entity(sample,
                              Dataset.TYPE.FASTQ1,
                              Dataset.TYPE.FASTQ1,
                              filesystem_location=TEST_DIRTY_FQ_1)

        # Add fastq datasets to sample
        add_dataset_to_entity(sample,
                              Dataset.TYPE.FASTQ2,
                              Dataset.TYPE.FASTQ2,
                              filesystem_location=TEST_DIRTY_FQ_2)

        # Run alignment of sample to reference
        alignment_group_label = 'test_alignment'
        sample_list = [sample]

        result = run_pipeline(alignment_group_label, ref_genome, sample_list)

        alignment_group = result[0]
        alignment_async_result = result[1]
        variant_calling_async_result = result[2]
        alignment_async_result.get()
        variant_calling_async_result.get()
        alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
        self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
                         alignment_group.status)
Example #12
def sv_testing_bootstrap(project):
    sv_testing_dir = os.path.join(GD_ROOT, 'test_data', 'sv_testing', 'all_svs')
    fasta = os.path.join(sv_testing_dir, 'ref.fa')
    fq1 = os.path.join(sv_testing_dir, 'simLibrary.1.fq')
    fq2 = os.path.join(sv_testing_dir, 'simLibrary.2.fq')

    ref_genome = import_reference_genome_from_local_file(
            project, 'ref', fasta, 'fasta')

    sample = ExperimentSample.objects.create(
            project=project,
            label='simLibrary',
    )
    copy_and_add_dataset_source(sample, Dataset.TYPE.FASTQ1,
            Dataset.TYPE.FASTQ1, fq1)
    copy_and_add_dataset_source(sample, Dataset.TYPE.FASTQ2,
            Dataset.TYPE.FASTQ2, fq2)

    if '--sv' in sys.argv:  # using --sv argument runs pipeline for SV project
        run_pipeline('sample_alignment_ref', ref_genome, [sample])
Example #13
def sv_testing_bootstrap(project):
    sv_testing_dir = os.path.join(GD_ROOT, 'test_data', 'sv_testing',
                                  'all_svs')
    fasta = os.path.join(sv_testing_dir, 'ref.fa')
    fq1 = os.path.join(sv_testing_dir, 'simLibrary.1.fq')
    fq2 = os.path.join(sv_testing_dir, 'simLibrary.2.fq')

    ref_genome = import_reference_genome_from_local_file(
        project, 'ref', fasta, 'fasta')

    sample = ExperimentSample.objects.create(
        project=project,
        label='simLibrary',
    )
    copy_and_add_dataset_source(sample, Dataset.TYPE.FASTQ1,
                                Dataset.TYPE.FASTQ1, fq1)
    copy_and_add_dataset_source(sample, Dataset.TYPE.FASTQ2,
                                Dataset.TYPE.FASTQ2, fq2)

    if '--sv' in sys.argv:  # using --sv argument runs pipeline for SV project
        run_pipeline('sample_alignment_ref', ref_genome, [sample])
Example #14
    def _perform_assembly(self, data_dict):

        ref_fasta = data_dict['ref_fasta']
        fq_1 = data_dict['fq_1']
        fq_2 = data_dict['fq_2']

        # Import reference genome
        ref_genome = import_reference_genome_from_local_file(
                self.project, 'test_ref',
                ref_fasta, 'fasta', move=False)

        # Create sample model
        sample = ExperimentSample.objects.create(
                project=self.project,
                label='test_sample')

        # Add fastq datasets to sample
        add_dataset_to_entity(
                sample,
                Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1,
                filesystem_location=fq_1)
        add_dataset_to_entity(
                sample,
                Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2,
                filesystem_location=fq_2)

        # Run alignment of sample to reference
        alignment_group_label = 'test_alignment'
        sample_list = [sample]
        alignment_group, _, _ = run_pipeline(
                alignment_group_label, ref_genome, sample_list,
                perform_variant_calling=False, alignment_options={})

        # Get resulting ExperimentSampleToAlignment
        sample_align = ExperimentSampleToAlignment.objects.get(
                alignment_group=alignment_group,
                experiment_sample=sample)

        # Run pipeline and wait on result
        async_result = run_de_novo_assembly_pipeline([sample_align])
        async_result.get()

        # Retrieve contigs
        contigs = Contig.objects.filter(
                parent_reference_genome=ref_genome,
                experiment_sample_to_alignment=sample_align)

        return contigs
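A hedged usage sketch for the _perform_assembly helper above: the dict keys mirror what the helper reads, but the test method name and the file paths are hypothetical placeholders.

    def test_assembly_produces_contigs(self):
        # Placeholder paths; point these at a real FASTA reference and
        # paired FASTQ read files before running.
        data_dict = {
            'ref_fasta': '/path/to/ref.fa',
            'fq_1': '/path/to/reads.1.fq',
            'fq_2': '/path/to/reads.2.fq',
        }
        contigs = self._perform_assembly(data_dict)
        self.assertTrue(contigs.count() > 0)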
Example #15
 def test_run_pipeline__multiple_samples(self):
     """End-to-end test of pipeline. Fails if any errors.
     """
     sample_list = [self.experiment_sample, self.experiment_sample_2]
     result = run_pipeline(
             'name_placeholder', self.reference_genome, sample_list)
     alignment_group = result[0]
     alignment_async_result = result[1]
     variant_calling_async_result = result[2]
     alignment_async_result.get()
     variant_calling_async_result.get()
     alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
     self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
             alignment_group.status)
Example #16
 def test_run_pipeline__multiple_samples(self):
     """End-to-end test of pipeline. Fails if any errors.
     """
     sample_list = [self.experiment_sample, self.experiment_sample_2]
     result = run_pipeline('name_placeholder', self.reference_genome,
                           sample_list)
     alignment_group = result[0]
     alignment_async_result = result[1]
     variant_calling_async_result = result[2]
     alignment_async_result.get()
     variant_calling_async_result.get()
     alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
     self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
                      alignment_group.status)
Example #17
    def test_run_pipeline__multiple_chromosomes(self):
        """Makes sure variant calling works when there are multiple chromosomes
        on a single reference genome.
        """
        ref_genome = import_reference_genome_from_local_file(
                self.project, 'concat_mg1655_partials',
                FullVCFTestSet.TEST_CONCAT_GENBANK, 'genbank')


        sample_obj = ExperimentSample.objects.create(
                project=self.project,
                label='Sample 0')

        # Add raw reads to each sample.
        copy_and_add_dataset_source(sample_obj,
                Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1,
                FullVCFTestSet.FASTQ1[0])
        copy_and_add_dataset_source(sample_obj,
                Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2,
                FullVCFTestSet.FASTQ2[0])

        sample_list = [sample_obj]

        result = run_pipeline(
                'name_placeholder', ref_genome, sample_list)
        alignment_group = result[0]
        alignment_async_result = result[1]
        variant_calling_async_result = result[2]
        alignment_async_result.get()
        variant_calling_async_result.get()
        alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
        self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
                alignment_group.status)

        # Validate that all variants are called.
        # TODO: Add Chromosome checking.

        v_515 = Variant.objects.get(
                reference_genome=alignment_group.reference_genome, position=515)
        v_515_va = v_515.variantalternate_set.all()[0]
        self.assertEqual('ygiB', v_515_va.data['INFO_EFF_GENE'])

        v_205 = Variant.objects.get(
                reference_genome=alignment_group.reference_genome, position=205)
        v_205_va = v_205.variantalternate_set.all()[0]
        self.assertEqual('tolC', v_205_va.data['INFO_EFF_GENE'])
Example #18
    def _align_and_assemble(self, ref_genome, sample_list):
        # Run alignment of sample to reference
        alignment_group_label = 'test_alignment'
        alignment_group, _, _ = run_pipeline(
                alignment_group_label, ref_genome, sample_list,
                perform_variant_calling=False, alignment_options={})

        # Get resulting ExperimentSampleToAlignment
        sample_align_list = ExperimentSampleToAlignment.objects.filter(
                alignment_group=alignment_group,
                experiment_sample__in=sample_list)

        # Run pipeline and wait on result
        run_de_novo_assembly_pipeline(sample_align_list)

        return alignment_group
Example #19
    def _align_and_assemble(self, ref_genome, sample_list):
        # Run alignment of sample to reference
        alignment_group_label = 'test_alignment'
        alignment_group, _, _ = run_pipeline(
                alignment_group_label, ref_genome, sample_list,
                perform_variant_calling=False, alignment_options={})

        # Get resulting ExperimentSampleToAlignment
        sample_align_list = ExperimentSampleToAlignment.objects.filter(
                alignment_group=alignment_group,
                experiment_sample__in=sample_list)

        # Run pipeline and wait on result
        async_result = run_de_novo_assembly_pipeline(sample_align_list)
        async_result.get()

        return alignment_group
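A hedged usage sketch for _align_and_assemble, assuming a reference genome and a sample with FASTQ datasets already exist as in the other examples; the Contig query mirrors Example #14 and the surrounding test is hypothetical.

        alignment_group = self._align_and_assemble(ref_genome, [sample])
        # Contigs produced by the assembly are attached to the reference genome.
        contigs = Contig.objects.filter(parent_reference_genome=ref_genome)
        self.assertTrue(contigs.count() > 0)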
Example #20
    def test_run_pipeline__snps_with_effect__no_svs(self):
        """Tests pipeline with SNPs with effect, but no SVs called.
        """
        ref_genome = import_reference_genome_from_local_file(
                self.project, 'mg1655_tolC_through_zupT',
                FullVCFTestSet.TEST_GENBANK, 'genbank')

        sample_obj = ExperimentSample.objects.create(
                project=self.project,
                label='Sample %d' % 0)

        # Add raw reads to each sample.
        copy_and_add_dataset_source(sample_obj,
                Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1,
                FullVCFTestSet.FASTQ1[0])
        copy_and_add_dataset_source(sample_obj,
                Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2,
                FullVCFTestSet.FASTQ2[0])

        result = run_pipeline(
            'test_align', ref_genome, [sample_obj])

        alignment_group = result[0]
        alignment_async_result = result[1]
        variant_calling_async_result = result[2]
        alignment_async_result.get()
        variant_calling_async_result.get()
        alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
        self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
                alignment_group.status)

        # Check that SnpEff worked.
        v_205 = Variant.objects.get(
                reference_genome=alignment_group.reference_genome, position=205)
        v_205_va = v_205.variantalternate_set.all()[0]
        self.assertEqual('tolC', v_205_va.data['INFO_EFF_GENE'])
Example #21
def bootstrap_fake_data():
    """Fill the database with fake data.
    """
    user = get_or_create_user()

    ### Create some projects
    (test_project,
     project_created) = Project.objects.get_or_create(title=TEST_PROJECT_NAME,
                                                      owner=user.get_profile())
    (test_project_2,
     project_created) = Project.objects.get_or_create(title=SV_PROJECT_NAME,
                                                      owner=user.get_profile())

    ### Create some reference genomes
    ref_genome_1 = import_reference_genome_from_local_file(
        test_project, REF_GENOME_1_LABEL, TEST_FASTA, 'fasta')

    ref_genome_2 = import_reference_genome_from_local_file(
        test_project, REF_GENOME_2_LABEL, TEST_FASTA, 'fasta')

    ref_genome_3 = import_reference_genome_from_local_file(
        test_project, 'test_genome', TEST_FASTA, 'fasta')

    ### Create some saved queries.
    for saved_query_text in CUSTOM_SAVED_QUERY_LIST:
        SavedVariantFilterQuery.objects.get_or_create(owner=user.get_profile(),
                                                      text=saved_query_text)

    ### Create some ExperimentSamples.

    # Create some samples without backing data just to explore the UI.
    ExperimentSample.objects.create(project=test_project,
                                    label='C321D_MiSeq',
                                    data={'SAMPLE_WELL': 'A01'})

    ExperimentSample.objects.create(project=test_project,
                                    label='C321D Fixed 01',
                                    data={'SAMPLE_WELL': 'A02'})

    ExperimentSample.objects.create(project=test_project,
                                    label='C321D Fixed 02',
                                    data={'SAMPLE_WELL': 'A03'})

    # Create some samples with backing data.
    (sample_1,
     created) = ExperimentSample.objects.get_or_create(project=test_project,
                                                       label=SAMPLE_1_LABEL)
    # Add datasets to the samples.
    if not sample_1.dataset_set.filter(type=Dataset.TYPE.FASTQ1):
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ1,
                                    Dataset.TYPE.FASTQ1, TEST_FASTQ1)
    if not sample_1.dataset_set.filter(type=Dataset.TYPE.FASTQ2):
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ2,
                                    Dataset.TYPE.FASTQ2, TEST_FASTQ2)

    # Create sample backed by gzipped data.
    gz_backed_sample = ExperimentSample.objects.create(
        project=test_project, label='sample backed by gz data')
    gz_fastq1_dataset = copy_and_add_dataset_source(gz_backed_sample,
                                                    Dataset.TYPE.FASTQ1,
                                                    Dataset.TYPE.FASTQ1,
                                                    TEST_FASTQ_GZ_1)
    gz_fastq2_dataset = copy_and_add_dataset_source(gz_backed_sample,
                                                    Dataset.TYPE.FASTQ1,
                                                    Dataset.TYPE.FASTQ2,
                                                    TEST_FASTQ_GZ_2)
    run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq1_dataset)
    run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq2_dataset, rev=True)

    ### Create an alignment.
    alignment_group_1 = AlignmentGroup.objects.create(
        label='Alignment 1',
        reference_genome=ref_genome_3,
        aligner=AlignmentGroup.ALIGNER.BWA)
    # Link it to a sample.
    sample_alignment = ExperimentSampleToAlignment.objects.create(
        alignment_group=alignment_group_1, experiment_sample=sample_1)
    ### Add alignment data. NOTE: Stored in sample model dir.
    # NOTE: This is a bit convoluted. Perhaps it would be better to store alignments
    # in the ExperimentSampleToAlignment directory.
    copy_dest = copy_dataset_to_entity_data_dir(sample_1, TEST_BAM)
    copy_dataset_to_entity_data_dir(sample_1, TEST_BAM_INDEX)
    add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
                          Dataset.TYPE.BWA_ALIGN, copy_dest)

    # Create fake variants.
    create_fake_variants_and_variant_sets(ref_genome_1)

    #############################
    # Full VCF Testing (annotated for snpeff, variant filtering, etc)
    #############################

    # Create a new reference genome and samples using full_vcf_test_set
    full_vcf_reference_genome = import_reference_genome_from_local_file(
        test_project, 'mg1655_tolC_through_zupT', FullVCFTestSet.TEST_GENBANK,
        'genbank')

    # Create all samples.
    parent_obj = None
    full_vcf_samples = []
    for i in range(FullVCFTestSet.NUM_SAMPLES):
        sample_obj = ExperimentSample.objects.create(project=test_project,
                                                     label='Sample %d' % i)

        sample_obj.data['SAMPLE_WELL'] = 'A0%d' % (i + 1)

        if i == 0:
            parent_obj = sample_obj
        if i > 0:
            sample_obj.data['SAMPLE_PARENTS'] = parent_obj.label
            parent_obj.add_child(sample_obj)
            parent_obj.save()

        sample_obj.save()

        # Add raw reads to each sample.
        fastq1_dataset = copy_and_add_dataset_source(sample_obj,
                                                     Dataset.TYPE.FASTQ1,
                                                     Dataset.TYPE.FASTQ1,
                                                     FullVCFTestSet.FASTQ1[i])
        fastq2_dataset = copy_and_add_dataset_source(sample_obj,
                                                     Dataset.TYPE.FASTQ2,
                                                     Dataset.TYPE.FASTQ2,
                                                     FullVCFTestSet.FASTQ2[i])

        # Run FASTQC on sample reads.
        run_fastqc_on_sample_fastq(sample_obj, fastq1_dataset)
        run_fastqc_on_sample_fastq(sample_obj, fastq2_dataset, rev=True)

        full_vcf_samples.append(sample_obj)

    # Run the alignment. Return the alignment group created, indexed by the
    # reference genome's uid.
    run_pipeline('test_align', full_vcf_reference_genome, full_vcf_samples)

    import_variant_set_from_vcf(full_vcf_reference_genome, 'Designed',
                                FullVCFTestSet.TEST_DESIGNED_SNPS)

    def _create_region_intervals(region, interval_tuple_list):
        """Helper method to create RegionIntervals for a Region.

        Args:
            region: Region Model object.
            interval_tuple_list: List of tuples of intervals to create.
        """
        for interval in interval_tuple_list:
            RegionInterval.objects.create(region=region,
                                          start=interval[0],
                                          end=interval[1])

    # Create some fake regions.
    # TODO: Should not be much harder to replace this with real regions.
    region_1 = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='region_1',
        type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_1, [(1, 150), (300, 400), (500, 900)])

    region_2 = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='region_2',
        type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_2, [(1000, 1500)])

    region_3 = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='region_3',
        type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_3, [(1800, 1900), (2150, 2300)])

    # And some GENE regions.

    gene_A = Region.objects.create(reference_genome=full_vcf_reference_genome,
                                   label='geneA',
                                   type=Region.TYPE.GENE)
    _create_region_intervals(gene_A, [(2000, 2400)])

    gene_B = Region.objects.create(reference_genome=full_vcf_reference_genome,
                                   label='geneB',
                                   type=Region.TYPE.GENE)
    _create_region_intervals(gene_B, [(4800, 5200)])

    gene_C = Region.objects.create(reference_genome=full_vcf_reference_genome,
                                   label='geneC',
                                   type=Region.TYPE.GENE)
    _create_region_intervals(gene_C, [(1, 500)])

    # Bootstrap test_project_2 with SV stuff
    sv_testing_bootstrap(test_project_2)
Example #22
    def test_pipeline_and_svs(self):
        alignment_group_obj, async_result = run_pipeline(
                'name', self.reference_genome, [self.experiment_sample])

        # Block until pipeline finishes.
        while not async_result.ready():
            time.sleep(1)
        if async_result.status == 'FAILURE':
            self.fail('Async task failed.')

        # Get a fresh copy of the AlignmentGroup object since it was processed
        # in a different thread.
        alignment_group_obj = AlignmentGroup.objects.get(
                id=alignment_group_obj.id)

        self.assertEqual(1,
                len(alignment_group_obj.experimentsampletoalignment_set.all()))
        self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
                alignment_group_obj.status)

        # Make sure the initial JBrowse config has been created.
        jbrowse_dir = self.reference_genome.get_jbrowse_directory_path()
        self.assertTrue(os.path.exists(jbrowse_dir))
        self.assertTrue(os.path.exists(os.path.join(jbrowse_dir,
                'indiv_tracks')))

        vcf_files = {}

        vcf_types = [VARIANT_TOOL_PARAMS_MAP[tool]['dataset_type']
                for tool in settings.ENABLED_VARIANT_CALLERS]

        for vcf_type in vcf_types:
            vcf_dataset = get_dataset_with_type(alignment_group_obj, vcf_type)
            self.assertIsNotNone(vcf_dataset,
                    msg='Missing vcf_dataset for {vcf_type}.'.format(
                            vcf_type=vcf_type))
            vcf_location = vcf_dataset.get_absolute_location()
            self.assertTrue(os.path.exists(vcf_location))
            vcf_files[vcf_type] = vcf_location

        # Check actual variants, with this helper vcf-parser function
        def get_variants(vcf_type):
            variants = []
            with open(vcf_files[vcf_type]) as fh:
                vcf_reader = vcf.Reader(fh)
                for record_idx, record in enumerate(vcf_reader):

                    raw_data_dict = extract_raw_data_dict(record)

                    # we should expect exactly 1 alternate
                    assert len(raw_data_dict['INFO_SVLEN']) == 1, (
                        'length of INFO_SVLEN > 1: {svlen}'.format(
                                svlen=raw_data_dict['INFO_SVLEN']))
                    assert len(raw_data_dict['INFO_SVTYPE']) == 1, (
                        'length of INFO_SVTYPE > 1: {svtype}'.format(
                                svtype=raw_data_dict['INFO_SVTYPE']))

                    variant_type = str(raw_data_dict.get('INFO_SVTYPE',
                            raw_data_dict.get('TYPE'))[0])
                    pos = int(raw_data_dict.get('POS'))
                    length = int(raw_data_dict.get('INFO_SVLEN')[0])
                    variants.append({
                        'type': variant_type,
                        'pos': pos,
                        'length': length
                        })
            return variants

        lumpy_variants = get_variants(Dataset.TYPE.VCF_LUMPY)

        # Helper function for checking a specific variant type
        def verify_variant_type(variants, variant_type, pos, length):
            for variant in variants:
                # Check variant against following gauntlet.
                if variant['type'] != variant_type:
                    continue # Fail, incorrect type.
                if abs(abs(variant['pos']) - pos) >= 100:
                    continue # Fail, incorrect position.
                if (length != -1 and
                        abs(abs(variant['length']) - length) >= 100):
                    continue # Fail, incorrect length.
                # Success, variant made it through gauntlet.
                return

            # If we got here, no matches were found, fail.
            self.fail('No %s position %s found' % (variant_type, pos))

        verify_variant_type(lumpy_variants, 'DEL', 10000, 1000)
Example #23
    def test_run_alignment_with_spaces_in_genbank_filename(self):
        project = self.common_entities['project']
        ref_genome_label = 'dirty_upload'
        request = HttpRequest()
        request.POST = {
            'projectUid': project.uid,
            'refGenomeLabel': ref_genome_label,
            'importFileFormat': 'genbank'
        }
        request.method = 'POST'
        request.user = self.common_entities['user']
        authenticate(username=TEST_USERNAME, password=TEST_PASSWORD)
        self.assertTrue(request.user.is_authenticated())

        request.FILES['refGenomeFile'] = UploadedFile(
                file=open(TEST_GENBANK),
                name='dirty_genbank (spaces).gb')

        response = create_ref_genome_from_browser_upload(request)
        self.assertEqual(STATUS_CODE__SUCCESS, response.status_code)
        self.assertFalse(json.loads(response.content).get('error', False))

        # Get reference genome
        ref_genome = ReferenceGenome.objects.get(
                project=project,
                label=ref_genome_label)

        # Create sample model
        sample = ExperimentSample.objects.create(
                project=project,
                label='test_sample')

        # Add fastq datasets to sample
        add_dataset_to_entity(
                sample,
                Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1,
                filesystem_location=TEST_DIRTY_FQ_1)

        # Add fastq datasets to sample
        add_dataset_to_entity(
                sample,
                Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2,
                filesystem_location=TEST_DIRTY_FQ_2)

        # Run alignment of sample to reference
        alignment_group_label = 'test_alignment'
        sample_list = [sample]

        result = run_pipeline(
                alignment_group_label, ref_genome, sample_list)

        alignment_group = result[0]
        alignment_async_result = result[1]
        variant_calling_async_result = result[2]
        alignment_async_result.get()
        variant_calling_async_result.get()
        alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
        self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
                alignment_group.status)
Example #24
def _start_new_alignment(request, project):
    """Delegate function that handles logic of kicking off alignment.
    """
    # Parse the data from the request body.
    request_data = json.loads(request.body)

    # Make sure the required keys are present.
    REQUIRED_KEYS = [
            'name', 'refGenomeUidList', 'sampleUidList', 'skipHetOnly',
            'callAsHaploid']

    if not all(key in request_data for key in REQUIRED_KEYS):
        return HttpResponseBadRequest("Invalid request. Missing keys.")

    try:
        # Parse the data and look up the relevant model instances.
        alignment_group_name = request_data['name']
        assert len(alignment_group_name), "Name required."

        ref_genome_list = ReferenceGenome.objects.filter(
                project=project,
                uid__in=request_data['refGenomeUidList'])
        assert (len(ref_genome_list) ==
                len(request_data['refGenomeUidList'])), (
                        "Invalid reference genome uid(s).")
        assert len(ref_genome_list) == 1, (
            "Exactly one reference genome must be provided.")
        ref_genome = ref_genome_list[0]

        # Make sure AlignmentGroup has a unique name, because run_pipeline
        # will re-use an alignment based on label, reference genome,
        # aligner. We are currently hard-coding the aligner to BWA.
        assert AlignmentGroup.objects.filter(
                label=alignment_group_name,
                reference_genome=ref_genome).count() == 0, (
                        "Please pick unique alignment name.")

        sample_list = ExperimentSample.objects.filter(
                project=project,
                uid__in=request_data['sampleUidList'])
        assert len(sample_list) == len(request_data['sampleUidList']), (
                "Invalid expeirment sample uid(s).")
        assert len(sample_list) > 0, "At least one sample required."

        # Populate alignment options.
        alignment_options = dict()
        if request_data['skipHetOnly']:
            alignment_options['skip_het_only'] = True

        if request_data['callAsHaploid']:
            alignment_options['call_as_haploid'] = True

        # Kick off alignments.
        run_pipeline(
                alignment_group_name,
                ref_genome, sample_list,
                alignment_options=alignment_options)

        # Success. Return a redirect response.
        response_data = {
            'redirect': reverse(
                    'main.views.alignment_list_view',
                    args=(project.uid,)),
        }
    except Exception as e:
        response_data = {
            'error': str(e)
        }

    return HttpResponse(json.dumps(response_data),
            content_type='application/json')
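For reference, a minimal sketch of the JSON body that _start_new_alignment expects, derived from REQUIRED_KEYS and the assertions above; the name and uid values are placeholders, not from the source.

import json

example_request_body = json.dumps({
    'name': 'my_alignment',                    # must be non-empty and unique
    'refGenomeUidList': ['<ref_genome_uid>'],  # exactly one uid expected
    'sampleUidList': ['<sample_uid>'],         # at least one uid required
    'skipHetOnly': False,
    'callAsHaploid': False,
})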
Example #25
def bootstrap_fake_data():
    """Fill the database with fake data.
    """
    user = get_or_create_user()

    ### Create some projects
    (test_project, project_created) = Project.objects.get_or_create(
            title=TEST_PROJECT_NAME, owner=user.get_profile())
    (test_project_2, project_created) = Project.objects.get_or_create(
            title=SV_PROJECT_NAME, owner=user.get_profile())

    ### Create some reference genomes
    ref_genome_1 = import_reference_genome_from_local_file(
            test_project, REF_GENOME_1_LABEL, TEST_FASTA, 'fasta')

    ref_genome_2 = import_reference_genome_from_local_file(
            test_project, REF_GENOME_2_LABEL, TEST_FASTA, 'fasta')

    ref_genome_3 = import_reference_genome_from_local_file(
            test_project, 'test_genome', TEST_FASTA, 'fasta')

    ### Create some saved queries.
    for saved_query_text in CUSTOM_SAVED_QUERY_LIST:
        SavedVariantFilterQuery.objects.get_or_create(
                owner=user.get_profile(),
                text=saved_query_text)

    ### Create some ExperimentSamples.

    # Create some samples without backing data just to explore the UI.
    ExperimentSample.objects.create(
            project=test_project,
            label='C321D_MiSeq',
            data={'SAMPLE_WELL': 'A01'}
    )

    ExperimentSample.objects.create(
            project=test_project,
            label='C321D Fixed 01',
            data={'SAMPLE_WELL': 'A02'}
    )

    ExperimentSample.objects.create(
            project=test_project,
            label='C321D Fixed 02',
            data={'SAMPLE_WELL': 'A03'}
    )

    # Create some samples with backing data.
    (sample_1, created) = ExperimentSample.objects.get_or_create(
            project=test_project,
            label=SAMPLE_1_LABEL)
    # Add datasets to the samples.
    if not sample_1.dataset_set.filter(type=Dataset.TYPE.FASTQ1):
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1, TEST_FASTQ1)
    if not sample_1.dataset_set.filter(type=Dataset.TYPE.FASTQ2):
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2, TEST_FASTQ2)

    # Create sample backed by gzipped data.
    gz_backed_sample = ExperimentSample.objects.create(
            project=test_project,
            label='sample backed by gz data')
    gz_fastq1_dataset = copy_and_add_dataset_source(
            gz_backed_sample, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
            TEST_FASTQ_GZ_1)
    gz_fastq2_dataset = copy_and_add_dataset_source(
            gz_backed_sample, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ2,
            TEST_FASTQ_GZ_2)
    run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq1_dataset)
    run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq2_dataset, rev=True)

    ### Create an alignment.
    alignment_group_1 = AlignmentGroup.objects.create(
            label='Alignment 1',
            reference_genome=ref_genome_3,
            aligner=AlignmentGroup.ALIGNER.BWA)
    # Link it to a sample.
    sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=alignment_group_1,
            experiment_sample=sample_1)
    ### Add alignment data. NOTE: Stored in sample model dir.
    # NOTE: This is a bit convoluted. Perhaps it would be better to store alignments
    # in the ExperimentSampleToAlignment directory.
    copy_dest = copy_dataset_to_entity_data_dir(sample_1, TEST_BAM)
    copy_dataset_to_entity_data_dir(sample_1, TEST_BAM_INDEX)
    add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
            Dataset.TYPE.BWA_ALIGN, copy_dest)

    # Create fake variants.
    create_fake_variants_and_variant_sets(ref_genome_1)

    #############################
    # Full VCF Testing (annotated for snpeff, variant filtering, etc)
    #############################

    # Create a new reference genome and samples using full_vcf_test_set
    full_vcf_reference_genome = import_reference_genome_from_local_file(
                test_project, 'mg1655_tolC_through_zupT',
                FullVCFTestSet.TEST_GENBANK, 'genbank')

    # Create all samples.
    parent_obj = None
    full_vcf_samples = []
    for i in range(FullVCFTestSet.NUM_SAMPLES):
        sample_obj = ExperimentSample.objects.create(
                project=test_project,
                label='Sample %d' % i)

        sample_obj.data['SAMPLE_WELL'] = 'A0%d' % (i+1)

        if i == 0:
            parent_obj = sample_obj
        if i > 0:
            sample_obj.data['SAMPLE_PARENTS'] = parent_obj.label
            parent_obj.add_child(sample_obj)
            parent_obj.save()

        sample_obj.save()

        # Add raw reads to each sample.
        fastq1_dataset = copy_and_add_dataset_source(sample_obj,
                Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1,
                FullVCFTestSet.FASTQ1[i])
        fastq2_dataset = copy_and_add_dataset_source(sample_obj,
                Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2,
                FullVCFTestSet.FASTQ2[i])

        # Run FASTQC on sample reads.
        run_fastqc_on_sample_fastq(sample_obj, fastq1_dataset)
        run_fastqc_on_sample_fastq(sample_obj, fastq2_dataset, rev=True)

        full_vcf_samples.append(sample_obj)

    # Run the alignment. Return the alignment group created, indexed by the
    # reference genome's uid.
    (full_vcf_alignment_group, pipeline_async_result) = run_pipeline(
            'test_align', full_vcf_reference_genome, full_vcf_samples)

    import_variant_set_from_vcf(full_vcf_reference_genome, 'Designed',
            FullVCFTestSet.TEST_DESIGNED_SNPS)

    def _create_region_intervals(region, interval_tuple_list):
        """Helper method to create RegionIntervals for a Region.

        Args:
            region: Region Model object.
            interval_tuple_list: List of tuples of intervals to create.
        """
        for interval in interval_tuple_list:
            RegionInterval.objects.create(
                    region=region,
                    start=interval[0],
                    end=interval[1])

    # Create some fake regions.
    # TODO: Should not be much harder to replace this with real regions.
    region_1 = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='region_1',
        type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_1, [(1, 150), (300, 400), (500, 900)])

    region_2 = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='region_2',
        type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_2, [(1000, 1500)])

    region_3 = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='region_3',
        type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_3, [(1800, 1900), (2150, 2300)])

    # And some GENE regions.

    gene_A = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='geneA',
        type=Region.TYPE.GENE)
    _create_region_intervals(gene_A, [(2000, 2400)])

    gene_B = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='geneB',
        type=Region.TYPE.GENE)
    _create_region_intervals(gene_B, [(4800, 5200)])

    gene_C = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='geneC',
        type=Region.TYPE.GENE)
    _create_region_intervals(gene_C, [(1, 500)])

    # Bootstrap test_project_2 with SV stuff
    sv_testing_bootstrap(test_project_2)
Example #26
def _start_new_alignment(request, project):
    """Delegate function that handles logic of kicking off alignment.
    """
    # Parse the data from the request body.
    request_data = json.loads(request.body)

    # Make sure the required keys are present.
    REQUIRED_KEYS = [
            'name', 'refGenomeUidList', 'sampleUidList', 'skipHetOnly',
            'callAsHaploid']

    if not all(key in request_data for key in REQUIRED_KEYS):
        return HttpResponseBadRequest("Invalid request. Missing keys.")

    try:
        # Parse the data and look up the relevant model instances.
        alignment_group_name = request_data['name']
        assert len(alignment_group_name), "Name required."

        ref_genome_list = ReferenceGenome.objects.filter(
                project=project,
                uid__in=request_data['refGenomeUidList'])
        assert (len(ref_genome_list) ==
                len(request_data['refGenomeUidList'])), (
                        "Invalid reference genome uid(s).")
        assert len(ref_genome_list) == 1, (
            "Exactly one reference genome must be provided.")
        ref_genome = ref_genome_list[0]

        # Make sure AlignmentGroup has a unique name, because run_pipeline
        # will re-use an alignment based on label, reference genome,
        # aligner. We are currently hard-coding the aligner to BWA.
        assert AlignmentGroup.objects.filter(
                label=alignment_group_name,
                reference_genome=ref_genome).count() == 0, (
                        "Please pick unique alignment name.")

        sample_list = ExperimentSample.objects.filter(
                project=project,
                uid__in=request_data['sampleUidList'])
        assert len(sample_list) == len(request_data['sampleUidList']), (
                "Invalid expeirment sample uid(s).")
        assert len(sample_list) > 0, "At least one sample required."

        # Populate alignment options.
        alignment_options = dict()
        if request_data['skipHetOnly']:
            alignment_options['skip_het_only'] = True

        if request_data['callAsHaploid']:
            alignment_options['call_as_haploid'] = True

        # Kick off alignments.
        run_pipeline(
                alignment_group_name,
                ref_genome, sample_list,
                alignment_options=alignment_options)

        # Success. Return a redirect response.
        response_data = {
            'redirect': reverse(
                    'main.views.alignment_list_view',
                    args=(project.uid,)),
        }
    except Exception as e:
        response_data = {
            'error': str(e)
        }

    return HttpResponse(json.dumps(response_data),
            content_type='application/json')
Example #27
def _rerun_alignment(alignment_group):
    """Re-runs existing alignment.
    """
    run_pipeline(alignment_group.label, alignment_group.reference_genome,
            alignment_group.get_samples())
    return HttpResponse(json.dumps({}), content_type='application/json')
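A minimal sketch, not from the source, of a guard one might place in front of _rerun_alignment so that only finished alignments are re-run; it uses only the COMPLETED status value that appears in the tests above, and the function name and error message are hypothetical.

def _rerun_alignment_if_completed(alignment_group):
    # Only re-run groups whose previous run finished; otherwise report back.
    if alignment_group.status != AlignmentGroup.STATUS.COMPLETED:
        return HttpResponse(
                json.dumps({'error': 'Alignment is still running.'}),
                content_type='application/json')
    return _rerun_alignment(alignment_group)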