Example #1
    def _fastqc_test_runner(self, fastq1_location, fastq2_location):
        """Helper that takes different fastqs as source.

        This function is a test itself.
        """
        # Run FastQC
        gz_backed_sample = self.common_entities['sample_1']
        gz_fastq1_dataset = copy_and_add_dataset_source(
                gz_backed_sample, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
                fastq1_location)
        gz_fastq2_dataset = copy_and_add_dataset_source(
                gz_backed_sample, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ2,
                fastq2_location)
        run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq1_dataset)
        run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq2_dataset,
                rev=True)

        # We expect 2 Datasets per fastq, so 4 total.
        self.assertEqual(4, Dataset.objects.count())

        # Check link matches file extension.
        FASTQC_DATASET_TYPES = [
                Dataset.TYPE.FASTQC1_HTML, Dataset.TYPE.FASTQC2_HTML]
        for fastqc_dataset_type in FASTQC_DATASET_TYPES:
            fastqc_dataset = get_dataset_with_type(
                    gz_backed_sample, fastqc_dataset_type)
            assert os.path.exists(fastqc_dataset.get_absolute_location())
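
A helper like this is driven by thin test methods; a minimal sketch, assuming fixture constants such as TEST_FASTQ1 and TEST_FASTQ1_GZ exist in the test module (hypothetical names, not taken from this example):

    def test_fastqc__uncompressed(self):
        # Hypothetical fixture constants; substitute the repo's own paths.
        self._fastqc_test_runner(TEST_FASTQ1, TEST_FASTQ2)

    def test_fastqc__gzipped(self):
        # Same test body, exercised against gzipped fastq sources.
        self._fastqc_test_runner(TEST_FASTQ1_GZ, TEST_FASTQ2_GZ)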
Example #3
    def _create_alignment(self, haploid=False):

        # Create a new alignment group.
        self.alignment_group = AlignmentGroup.objects.create(
                label='test alignment', reference_genome=self.REFERENCE_GENOME)

        if haploid:
            self.alignment_group.alignment_options['call_as_haploid'] = True

        # Create a sample.
        self.sample_1 = ExperimentSample.objects.create(
                uid=self.FAKE_READS_SAMPLE_UID,
                project=self.project,
                label='sample1')
        ### Add the raw reads
        copy_and_add_dataset_source(self.sample_1, Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1, self.FAKE_READS_FASTQ1)
        copy_and_add_dataset_source(self.sample_1, Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2, self.FAKE_READS_FASTQ2)

        # Create alignment to the sample.
        sample_alignment = ExperimentSampleToAlignment.objects.create(
                alignment_group=self.alignment_group,
                experiment_sample=self.sample_1)
        ### Add alignment data. NOTE: Stored in sample model dir.
        copy_dest = copy_dataset_to_entity_data_dir(
                self.sample_1, self.FAKE_READS_BAM)
        copy_dataset_to_entity_data_dir(self.sample_1,
                self.FAKE_READS_BAM_INDEX)
        add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
                Dataset.TYPE.BWA_ALIGN, copy_dest)
Example #4
    def test_run_pipeline__snps_with_effect__no_svs(self):
        """Tests pipeline with SNPs with effect, but no SVs called.
        """
        ref_genome = import_reference_genome_from_local_file(
            self.project, 'mg1655_tolC_through_zupT',
            FullVCFTestSet.TEST_GENBANK, 'genbank')

        sample_obj = ExperimentSample.objects.create(project=self.project,
                                                     label='Sample %d' % 0)

        # Add raw reads to each sample.
        copy_and_add_dataset_source(sample_obj, Dataset.TYPE.FASTQ1,
                                    Dataset.TYPE.FASTQ1,
                                    FullVCFTestSet.FASTQ1[0])
        copy_and_add_dataset_source(sample_obj, Dataset.TYPE.FASTQ2,
                                    Dataset.TYPE.FASTQ2,
                                    FullVCFTestSet.FASTQ2[0])

        result = run_pipeline('test_align', ref_genome, [sample_obj])

        alignment_group = result[0]
        alignment_async_result = result[1]
        variant_calling_async_result = result[2]
        alignment_async_result.get()
        variant_calling_async_result.get()
        alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
        self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
                         alignment_group.status)

        # Check that SnpEff worked.
        v_205 = Variant.objects.get(
            reference_genome=alignment_group.reference_genome, position=205)
        v_205_va = v_205.variantalternate_set.all()[0]
        self.assertEqual('tolC', v_205_va.data['INFO_EFF_GENE'])
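
The indexed accesses into result amount to a tuple unpack; a behavior-identical restatement (the async handles look Celery-style, which is an assumption here):

        # Equivalent unpacking of run_pipeline's 3-tuple result.
        (alignment_group, alignment_async_result,
                variant_calling_async_result) = result
        # .get() blocks until each async stage completes.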
Example #5
    def setUp(self):
        user = User.objects.create_user(TEST_USERNAME, password=TEST_PASSWORD,
                email=TEST_EMAIL)
        self.project = Project.objects.create(owner=user.get_profile(),
                title='Test Project')

        # Create a ref genome.
        self.reference_genome = import_reference_genome_from_local_file(
                self.project, 'ref_genome', TEST_FASTA, 'fasta')

        # Create a sample.
        self.experiment_sample = ExperimentSample.objects.create(
                project=self.project, label='sample1')
        copy_and_add_dataset_source(self.experiment_sample, Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1, TEST_FASTQ1)
        copy_and_add_dataset_source(self.experiment_sample, Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2, TEST_FASTQ2)

        # Create second sample.
        self.experiment_sample_2 = ExperimentSample.objects.create(
                project=self.project, label='sample2')
        copy_and_add_dataset_source(
                self.experiment_sample_2, Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1, TEST_SAMPLE_2_FASTQ1)
        copy_and_add_dataset_source(
                self.experiment_sample_2, Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2, TEST_SAMPLE_2_FASTQ2)

        # Create a sample with a single fastq
        self.experiment_sample_single_fastq = ExperimentSample.objects.create(
                project=self.project, label='sample_single_fastq')

        # Add the fastq file to the sample
        copy_and_add_dataset_source(self.experiment_sample_single_fastq,
                Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1, TEST_FASTQ1)
Example #6
    def _create_alignment(self, haploid=False):

        # Create a new alignment group.
        self.alignment_group = AlignmentGroup.objects.create(
            label='test alignment', reference_genome=self.REFERENCE_GENOME)

        if haploid:
            self.alignment_group.alignment_options['call_as_haploid'] = True

        # Create a sample.
        self.sample_1 = ExperimentSample.objects.create(
            uid=self.FAKE_READS_SAMPLE_UID,
            project=self.project,
            label='sample1')
        ### Add the raw reads
        copy_and_add_dataset_source(self.sample_1, Dataset.TYPE.FASTQ1,
                                    Dataset.TYPE.FASTQ1,
                                    self.FAKE_READS_FASTQ1)
        copy_and_add_dataset_source(self.sample_1, Dataset.TYPE.FASTQ2,
                                    Dataset.TYPE.FASTQ2,
                                    self.FAKE_READS_FASTQ2)

        # Create alignment to the sample.
        sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=self.alignment_group,
            experiment_sample=self.sample_1)
        ### Add alignment data. NOTE: Stored in sample model dir.
        copy_dest = copy_dataset_to_entity_data_dir(self.sample_1,
                                                    self.FAKE_READS_BAM)
        copy_dataset_to_entity_data_dir(self.sample_1,
                                        self.FAKE_READS_BAM_INDEX)
        add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
                              Dataset.TYPE.BWA_ALIGN, copy_dest)
Example #7
    def setUp(self):
        user = User.objects.create_user('test_username_sv', password='******',
                email='*****@*****.**')

        # Grab a project.
        self.project = Project.objects.create(title='test project',
                owner=user.get_profile())

        # Use genome with deletion from our sv testing repo:
        # https://github.com/churchlab/structural-variants-testing
        DELETION_TEST_DATA_DIR = os.path.join(settings.PWD, 'test_data',
                'sv_testing', 'deletion_bd5a1123')
        REF = os.path.join(DELETION_TEST_DATA_DIR, 'small_ref.fa')
        FASTQ1 = os.path.join(DELETION_TEST_DATA_DIR, 'deletion_bd5a1123.1.fq')
        FASTQ2 = os.path.join(DELETION_TEST_DATA_DIR, 'deletion_bd5a1123.2.fq')

        # Create Datasets / import data.
        self.reference_genome = import_reference_genome_from_local_file(
                self.project, 'ref_genome', REF, 'fasta')
        self.experiment_sample = ExperimentSample.objects.create(
                project=self.project, label='sample1')
        copy_and_add_dataset_source(self.experiment_sample, Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1, FASTQ1)
        copy_and_add_dataset_source(self.experiment_sample, Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2, FASTQ2)
Example #8
    def setUp(self):
        user = User.objects.create_user(TEST_USERNAME,
                                        password=TEST_PASSWORD,
                                        email=TEST_EMAIL)
        self.project = Project.objects.create(owner=user.get_profile(),
                                              title='Test Project')

        # Create a ref genome.
        self.reference_genome = import_reference_genome_from_local_file(
            self.project, 'ref_genome', TEST_FASTA, 'fasta')

        # Create a sample.
        self.experiment_sample = ExperimentSample.objects.create(
            project=self.project, label='sample1')

        # Create a sample for compressed fastq data.
        self.compressed_experiment_sample = ExperimentSample.objects.create(
            project=self.project, label='sample1')

        # Add fastq files to first sample.
        copy_and_add_dataset_source(self.experiment_sample,
                                    Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
                                    TEST_FASTQ1)
        copy_and_add_dataset_source(self.experiment_sample,
                                    Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2,
                                    TEST_FASTQ2)

        # Add compressed fastq files to second sample.
        copy_and_add_dataset_source(self.compressed_experiment_sample,
                                    Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
                                    TEST_FASTQ1_GZ)
        copy_and_add_dataset_source(self.compressed_experiment_sample,
                                    Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2,
                                    TEST_FASTQ2_GZ)

        # Create a new alignment group.
        alignment_group = AlignmentGroup.objects.create(
            label='test alignment', reference_genome=self.reference_genome)

        # Create the expected models.
        sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=alignment_group,
            experiment_sample=self.experiment_sample)

        bwa_dataset = copy_and_add_dataset_source(
            sample_alignment,
            dataset_label=Dataset.TYPE.BWA_ALIGN,
            dataset_type=Dataset.TYPE.BWA_ALIGN,
            original_source_location=TEST_DISC_SPLIT_BAM)

        bwa_dataset.status = Dataset.STATUS.READY
        bwa_dataset.save()

        index_bam_file(bwa_dataset.get_absolute_location())

        self.bwa_dataset = bwa_dataset
        self.sample_alignment = sample_alignment
Example #9
def create_sample_and_alignment(project,
                                alignment_group,
                                sample_uid,
                                bwa_alignment=None):
    sample = ExperimentSample.objects.create(uid=sample_uid,
                                             project=project,
                                             label=sample_uid)
    sample_alignment = ExperimentSampleToAlignment.objects.create(
        alignment_group=alignment_group, experiment_sample=sample)
    if bwa_alignment is not None:
        copy_and_add_dataset_source(sample_alignment, Dataset.TYPE.BWA_ALIGN,
                                    Dataset.TYPE.BWA_ALIGN, bwa_alignment)
    return {'sample': sample, 'sample_alignment': sample_alignment}
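
For context, a minimal usage sketch of this helper; the UID and BAM path below are made-up placeholders, not values from these examples:

# Hypothetical usage; 'ab12cd34' and the BAM path are placeholders.
result = create_sample_and_alignment(
    project, alignment_group, 'ab12cd34',
    bwa_alignment='/path/to/aligned_reads.bam')
sample = result['sample']
sample_alignment = result['sample_alignment']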
Example #10
    def setUp(self):
        user = User.objects.create_user(TEST_USERNAME,
                                        password=TEST_PASSWORD,
                                        email=TEST_EMAIL)
        self.project = Project.objects.create(owner=user.get_profile(),
                                              title='Test Project')

        # Create a ref genome.
        self.reference_genome = import_reference_genome_from_local_file(
            self.project, 'ref_genome', TEST_FASTA, 'fasta')

        # Create a sample.
        self.experiment_sample = ExperimentSample.objects.create(
            project=self.project, label='sample1')

        # Create a sample for compressed fastq data.
        self.compressed_experiment_sample = ExperimentSample.objects.create(
            project=self.project, label='sample1')

        # Add fastq files to first sample.
        copy_and_add_dataset_source(self.experiment_sample,
                                    Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
                                    TEST_FASTQ1)
        copy_and_add_dataset_source(self.experiment_sample,
                                    Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2,
                                    TEST_FASTQ2)

        # Add compressed fastq files to second sample.
        copy_and_add_dataset_source(self.compressed_experiment_sample,
                                    Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
                                    TEST_FASTQ1_GZ)
        copy_and_add_dataset_source(self.compressed_experiment_sample,
                                    Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2,
                                    TEST_FASTQ2_GZ)
Example #11
    def setUp(self):
        common_entities = create_common_entities()
        self.project = common_entities['project']
        self.reference_genome = import_reference_genome_from_local_file(
            self.project, 'ref_genome', TEST_FASTA, 'fasta')

        self.experiment_sample = ExperimentSample.objects.create(
            project=self.project, label='sample1')
        copy_and_add_dataset_source(self.experiment_sample,
                                    Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
                                    TEST_FASTQ1)
        copy_and_add_dataset_source(self.experiment_sample,
                                    Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2,
                                    TEST_FASTQ2)
Example #12
    def setUp(self):
        user = User.objects.create_user(TEST_USERNAME, password=TEST_PASSWORD,
                email=TEST_EMAIL)
        self.project = Project.objects.create(owner=user.get_profile(),
                title='Test Project')

        # Create a ref genome.
        self.reference_genome = import_reference_genome_from_local_file(
                self.project, 'ref_genome', TEST_FASTA, 'fasta')

        # Create a sample.
        self.experiment_sample = ExperimentSample.objects.create(
                project=self.project, label='sample1')

        # Create a sample for compressed fastq data.
        self.compressed_experiment_sample = ExperimentSample.objects.create(
                project=self.project, label='sample1')

        # Add fastq files to first sample.
        copy_and_add_dataset_source(self.experiment_sample, Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1, TEST_FASTQ1)
        copy_and_add_dataset_source(self.experiment_sample, Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2, TEST_FASTQ2)

        # Add compressed fastq files to second sample.
        copy_and_add_dataset_source(self.compressed_experiment_sample,
                Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1, TEST_FASTQ1_GZ)
        copy_and_add_dataset_source(self.compressed_experiment_sample,
                Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2, TEST_FASTQ2_GZ)
Example #13
def create_sample_and_alignment(
        project, alignment_group, sample_uid, bwa_alignment=None):
    sample = ExperimentSample.objects.create(
            uid=sample_uid, project=project, label=sample_uid)
    sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=alignment_group, experiment_sample=sample)
    if bwa_alignment is not None:
        copy_and_add_dataset_source(
                sample_alignment, Dataset.TYPE.BWA_ALIGN, Dataset.TYPE.BWA_ALIGN,
                bwa_alignment)
    return {
        'sample': sample,
        'sample_alignment': sample_alignment
    }
Example #14
    def test_parser_skip_het(self):
        """Test that skipping het_only variants works.
        """
        VCF_DATATYPE = Dataset.TYPE.VCF_FREEBAYES
        alignment_group = AlignmentGroup.objects.create(
                label='test alignment', reference_genome=self.reference_genome)
        copy_and_add_dataset_source(alignment_group, VCF_DATATYPE,
                VCF_DATATYPE, TEST_GENOME_SNPS)

        Chromosome.objects.create(
            reference_genome=self.reference_genome,
            label='Chromosome',
            num_bases=9001)

        # Create experiment sample objects having UIDs that correspond to
        # those in the vcf file. This is a bit "fake" in that, in the actual
        # pipeline, we will be generating the vcf file from the samples (see
        # the add_groups() stage of the pipeline).
        with open(TEST_GENOME_SNPS) as fh:
            reader = vcf.Reader(fh)
            experiment_sample_uids = reader.samples
        num_experiment_samples = len(experiment_sample_uids)
        for sample_uid in experiment_sample_uids:
            ExperimentSample.objects.create(
                uid=sample_uid,
                project=self.project,
                label='fakename:' + sample_uid
            )

        # Count the records in the vcf file that have at least one
        # homozygous-alt sample; these should survive het-only skipping.
        record_count = 0
        with open(TEST_GENOME_SNPS) as fh:
            for record in vcf.Reader(fh):
                if sum([s.gt_type == 2 for s in record.samples]) > 0:
                    record_count += 1
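        # In PyVCF, Call.gt_type encodes 0 = hom-ref, 1 = het, 2 = hom-alt,
        # and None = uncalled, so gt_type == 2 picks out homozygous-alt calls.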

        # Turn on het-only skipping
        alignment_group.alignment_options['skip_het_only'] = True
        alignment_group.save()

        # Parse the vcf
        parse_alignment_group_vcf(alignment_group, VCF_DATATYPE)

        variant_list = Variant.objects.filter(
                reference_genome=self.reference_genome)

        # There should be one Variant object for each record that is not
        # het-only.
        self.assertEqual(record_count, len(variant_list))
Example #15
    def test_run_pipeline__multiple_chromosomes(self):
        """Makes sure variant calling works when there are multiple chromosomes
        on a single reference genome.
        """
        ref_genome = import_reference_genome_from_local_file(
                self.project, 'concat_mg1655_partials',
                FullVCFTestSet.TEST_CONCAT_GENBANK, 'genbank')

        sample_obj = ExperimentSample.objects.create(
                project=self.project,
                label='Sample 0')

        # Add raw reads to each sample.
        copy_and_add_dataset_source(sample_obj,
                Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1,
                FullVCFTestSet.FASTQ1[0])
        copy_and_add_dataset_source(sample_obj,
                Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2,
                FullVCFTestSet.FASTQ2[0])

        sample_list = [sample_obj]

        result = run_pipeline(
                'name_placeholder', ref_genome, sample_list)
        alignment_group = result[0]
        alignment_async_result = result[1]
        variant_calling_async_result = result[2]
        alignment_async_result.get()
        variant_calling_async_result.get()
        alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
        self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
                alignment_group.status)

        # Validate that the expected variants were called.
        # TODO: Add Chromosome checking.

        v_515 = Variant.objects.get(
                reference_genome=alignment_group.reference_genome, position=515)
        v_515_va = v_515.variantalternate_set.all()[0]
        self.assertEqual('ygiB', v_515_va.data['INFO_EFF_GENE'])

        v_205 = Variant.objects.get(
                reference_genome=alignment_group.reference_genome, position=205)
        v_205_va = v_205.variantalternate_set.all()[0]
        self.assertEqual('tolC', v_205_va.data['INFO_EFF_GENE'])
Example #16
    def setUp(self):
        user = User.objects.create_user(TEST_USERNAME, password=TEST_PASSWORD,
                email=TEST_EMAIL)
        self.project = Project.objects.create(owner=user.get_profile(),
                title='Test Project')

        # Create a ref genome.
        self.reference_genome = import_reference_genome_from_local_file(
                self.project, 'ref_genome', TEST_FASTA, 'fasta')

        # Create a sample.
        self.experiment_sample = ExperimentSample.objects.create(
                project=self.project, label='sample1')

        # Create a sample for compressed fastq data.
        self.compressed_experiment_sample = ExperimentSample.objects.create(
                project=self.project, label='sample1')

        # Add fastq files to first sample.
        copy_and_add_dataset_source(self.experiment_sample, Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1, TEST_FASTQ1)
        copy_and_add_dataset_source(self.experiment_sample, Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2, TEST_FASTQ2)

        # Add compressed fastq files to second sample.
        copy_and_add_dataset_source(self.compressed_experiment_sample,
                Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1, TEST_FASTQ1_GZ)
        copy_and_add_dataset_source(self.compressed_experiment_sample,
                Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2, TEST_FASTQ2_GZ)

        # Create a new alignment group.
        alignment_group = AlignmentGroup.objects.create(
                label='test alignment', reference_genome=self.reference_genome)

        # Create the expected models.
        sample_alignment = ExperimentSampleToAlignment.objects.create(
                alignment_group=alignment_group,
                experiment_sample=self.experiment_sample)

        bwa_dataset = copy_and_add_dataset_source(
                sample_alignment,
                dataset_label=Dataset.TYPE.BWA_ALIGN,
                dataset_type=Dataset.TYPE.BWA_ALIGN,
                original_source_location=TEST_DISC_SPLIT_BAM)

        bwa_dataset.status = Dataset.STATUS.READY
        bwa_dataset.save()

        index_bam_file(bwa_dataset.get_absolute_location())

        self.bwa_dataset = bwa_dataset
        self.sample_alignment = sample_alignment
Example #17
def sv_testing_bootstrap(project):
    sv_testing_dir = os.path.join(GD_ROOT, 'test_data', 'sv_testing', 'all_svs')
    fasta = os.path.join(sv_testing_dir, 'ref.fa')
    fq1 = os.path.join(sv_testing_dir, 'simLibrary.1.fq')
    fq2 = os.path.join(sv_testing_dir, 'simLibrary.2.fq')

    ref_genome = import_reference_genome_from_local_file(
            project, 'ref', fasta, 'fasta')

    sample = ExperimentSample.objects.create(
            project=project,
            label='simLibrary',
    )
    copy_and_add_dataset_source(sample, Dataset.TYPE.FASTQ1,
            Dataset.TYPE.FASTQ1, fq1)
    copy_and_add_dataset_source(sample, Dataset.TYPE.FASTQ2,
            Dataset.TYPE.FASTQ2, fq2)

    if '--sv' in sys.argv:  # the --sv flag runs the pipeline for the SV project
        run_pipeline('sample_alignment_ref', ref_genome, [sample])
Example #18
def sv_testing_bootstrap(project):
    sv_testing_dir = os.path.join(GD_ROOT, 'test_data', 'sv_testing',
                                  'all_svs')
    fasta = os.path.join(sv_testing_dir, 'ref.fa')
    fq1 = os.path.join(sv_testing_dir, 'simLibrary.1.fq')
    fq2 = os.path.join(sv_testing_dir, 'simLibrary.2.fq')

    ref_genome = import_reference_genome_from_local_file(
        project, 'ref', fasta, 'fasta')

    sample = ExperimentSample.objects.create(
        project=project,
        label='simLibrary',
    )
    copy_and_add_dataset_source(sample, Dataset.TYPE.FASTQ1,
                                Dataset.TYPE.FASTQ1, fq1)
    copy_and_add_dataset_source(sample, Dataset.TYPE.FASTQ2,
                                Dataset.TYPE.FASTQ2, fq2)

    if '--sv' in sys.argv:  # the --sv flag runs the pipeline for the SV project
        run_pipeline('sample_alignment_ref', ref_genome, [sample])
Example #19
    def test_run_pipeline__snps_with_effect__no_svs(self):
        """Tests pipeline with SNPs with effect, but no SVs called.
        """
        ref_genome = import_reference_genome_from_local_file(
                self.project, 'mg1655_tolC_through_zupT',
                FullVCFTestSet.TEST_GENBANK, 'genbank')

        sample_obj = ExperimentSample.objects.create(
                project=self.project,
                label='Sample %d' % 0)

        # Add raw reads to each sample.
        copy_and_add_dataset_source(sample_obj,
                Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1,
                FullVCFTestSet.FASTQ1[0])
        copy_and_add_dataset_source(sample_obj,
                Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2,
                FullVCFTestSet.FASTQ2[0])

        result = run_pipeline(
            'test_align', ref_genome, [sample_obj])

        alignment_group = result[0]
        alignment_async_result = result[1]
        variant_calling_async_result = result[2]
        alignment_async_result.get()
        variant_calling_async_result.get()
        alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
        self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
                alignment_group.status)

        # Check that SnpEff worked.
        v_205 = Variant.objects.get(
                reference_genome=alignment_group.reference_genome, position=205)
        v_205_va = v_205.variantalternate_set.all()[0]
        self.assertEqual('tolC', v_205_va.data['INFO_EFF_GENE'])
Example #20
    def test_parser__sv_lumpy(self):
        """Tests parsing lumpy output which contains SV data.
        """
        DELETION_TEST_DATA_DIR = os.path.join(TEST_DATA_DIR,
                'sv_testing', 'deletion_bd5a1123')
        DELETION_REF_FASTA = os.path.join(
                DELETION_TEST_DATA_DIR, 'small_ref.fa')

        DELETION_SAMPLE_1_UID = 'ds1'
        DELETION_SAMPLE_2_UID = 'ds2'
        DELETION_SAMPLE_3_UID = 'ds3'
        DELETION_SAMPLE_4_UID = 'f8346a99'

        reference_genome = import_reference_genome_from_local_file(
                self.project, 'ref_genome', DELETION_REF_FASTA, 'fasta')

        alignment_group = AlignmentGroup.objects.create(
                label='Alignment 1', reference_genome=reference_genome,
                aligner=AlignmentGroup.ALIGNER.BWA)

        # Connect lumpy vcf as Dataset.
        lumpy_vcf_dataset = copy_and_add_dataset_source(
                alignment_group, Dataset.TYPE.VCF_LUMPY, Dataset.TYPE.VCF_LUMPY,
                LUMPY_4_SAMPLES_2_DELETIONS_VCF)

        # Create samples corresponding to sample ids in vcf.
        create_sample_and_alignment(
                self.project, alignment_group, DELETION_SAMPLE_1_UID)
        create_sample_and_alignment(
                self.project, alignment_group, DELETION_SAMPLE_2_UID)
        create_sample_and_alignment(
                self.project, alignment_group, DELETION_SAMPLE_3_UID)
        create_sample_and_alignment(
                self.project, alignment_group, DELETION_SAMPLE_4_UID)

        # Now we have everything we need to parse the vcf.
        parse_vcf(lumpy_vcf_dataset, alignment_group)

        # Check expected variants.
        v_4998 = Variant.objects.get(
                reference_genome=reference_genome, position=4998)
        v_4998_vccd = v_4998.variantcallercommondata_set.all()[0]
        self.assertTrue(v_4998_vccd.data['IS_SV'])

        v_9999 = Variant.objects.get(
                reference_genome=reference_genome, position=9999)
        v_9999_vccd = v_9999.variantcallercommondata_set.all()[0]
        self.assertTrue(v_9999_vccd.data['IS_SV'])
Example #21
def create_recoli_sv_data_from_vcf(project):
    """Populate database with SVs from lumpy vcf.
    """
    VCF_PARSER_TEST_DATA_DIR = os.path.join(TEST_DATA_DIR, 'vcf_parser_test_data')

    LUMPY_4_SAMPLES_RECOLI_VCF = os.path.join(
            VCF_PARSER_TEST_DATA_DIR, 'lumpy_4_samples_recoli.vcf')

    SAMPLE_1_UID = '3990b0f4'
    SAMPLE_2_UID = '0e250e34'
    SAMPLE_3_UID = '396ea926'
    SAMPLE_4_UID = '4a09d3dd'

    reference_genome = ReferenceGenome.objects.create(
            project=project, label='myref')

    Chromosome.objects.create(
            reference_genome=reference_genome,
            label='the chrom',
            seqrecord_id='U00096.2',
            num_bases=5000000000)

    alignment_group = AlignmentGroup.objects.create(
            label='Alignment 1', reference_genome=reference_genome,
            aligner=AlignmentGroup.ALIGNER.BWA)

    # Connect lumpy vcf as Dataset.
    lumpy_vcf_dataset = copy_and_add_dataset_source(
            alignment_group, Dataset.TYPE.VCF_LUMPY, Dataset.TYPE.VCF_LUMPY,
            LUMPY_4_SAMPLES_RECOLI_VCF)

    # Create samples corresponding to sample ids in vcf.
    create_sample_and_alignment(
            project, alignment_group, SAMPLE_1_UID)
    create_sample_and_alignment(
            project, alignment_group, SAMPLE_2_UID)
    create_sample_and_alignment(
            project, alignment_group, SAMPLE_3_UID)
    create_sample_and_alignment(
            project, alignment_group, SAMPLE_4_UID)

    # Now we have everything we need to parse the vcf.
    parse_vcf(lumpy_vcf_dataset, alignment_group)
Example #22
def create_recoli_sv_data_from_vcf(project):
    """Populate database with SVs from lumpy vcf.
    """
    VCF_PARSER_TEST_DATA_DIR = os.path.join(TEST_DATA_DIR,
                                            'vcf_parser_test_data')

    LUMPY_4_SAMPLES_RECOLI_VCF = os.path.join(VCF_PARSER_TEST_DATA_DIR,
                                              'lumpy_4_samples_recoli.vcf')

    SAMPLE_1_UID = '3990b0f4'
    SAMPLE_2_UID = '0e250e34'
    SAMPLE_3_UID = '396ea926'
    SAMPLE_4_UID = '4a09d3dd'

    reference_genome = ReferenceGenome.objects.create(project=project,
                                                      label='myref')

    Chromosome.objects.create(reference_genome=reference_genome,
                              label='the chrom',
                              seqrecord_id='U00096.2',
                              num_bases=5000000000)

    alignment_group = AlignmentGroup.objects.create(
        label='Alignment 1',
        reference_genome=reference_genome,
        aligner=AlignmentGroup.ALIGNER.BWA)

    # Connect lumpy vcf as Dataset.
    lumpy_vcf_dataset = copy_and_add_dataset_source(
        alignment_group, Dataset.TYPE.VCF_LUMPY, Dataset.TYPE.VCF_LUMPY,
        LUMPY_4_SAMPLES_RECOLI_VCF)

    # Create samples corresponding to sample ids in vcf.
    create_sample_and_alignment(project, alignment_group, SAMPLE_1_UID)
    create_sample_and_alignment(project, alignment_group, SAMPLE_2_UID)
    create_sample_and_alignment(project, alignment_group, SAMPLE_3_UID)
    create_sample_and_alignment(project, alignment_group, SAMPLE_4_UID)

    # Now we have everything we need to parse the vcf.
    parse_vcf(lumpy_vcf_dataset, alignment_group)
Example #23
def create_common_entities_w_variants():
    """Creates the most common entities for testing.

    Returns a User, Project, ReferenceGenome, alignment,
    and variants that are all related.
    """

    # this is the number of samples in the VCF file
    num_samples = 10

    user = User.objects.create_user(TEST_USERNAME,
                                    password=TEST_PASSWORD,
                                    email=TEST_EMAIL)

    project = Project.objects.create(title=TEST_PROJECT_NAME,
                                     owner=user.get_profile())

    reference_genome = ReferenceGenome.objects.create(
        project=project, label=TEST_REF_GENOME_LABEL)

    chromosome = Chromosome.objects.create(reference_genome=reference_genome,
                                           label='Chromosome',
                                           seqrecord_id='Chromosome',
                                           num_bases=9001)

    alignment_group = AlignmentGroup.objects.create(
        label='Alignment 1',
        reference_genome=reference_genome,
        aligner=AlignmentGroup.ALIGNER.BWA)

    Chromosome.objects.create(reference_genome=reference_genome,
                              label='Chromosome',
                              num_bases=2000)

    VCF_DATATYPE = Dataset.TYPE.VCF_FREEBAYES
    copy_and_add_dataset_source(alignment_group, VCF_DATATYPE, VCF_DATATYPE,
                                TEST_GENOME_SNPS)

    # Create experiment sample objects having UIDs that correspond to those
    # in the vcf file. This is a bit "fake" in that, in the actual pipeline,
    # we will be generating the vcf file from the samples (see the
    # add_groups() stage of the pipeline).
    with open(TEST_GENOME_SNPS) as fh:
        reader = vcf.Reader(fh)
        experiment_sample_uids = reader.samples
    samples = [
        ExperimentSample.objects.create(uid=sample_uid,
                                        project=project,
                                        label='fakename:' + sample_uid)
        for sample_uid in experiment_sample_uids
    ]

    # add samples to alignment group
    for sample in samples:
        ExperimentSampleToAlignment.objects.get_or_create(
            alignment_group=alignment_group, experiment_sample=sample)

    # Parse the vcf
    parse_alignment_group_vcf(alignment_group, VCF_DATATYPE)

    return {
        'user': user,
        'project': project,
        'reference_genome': reference_genome,
        'chromosome': chromosome,
        'samples': samples,
        'alignment_group': alignment_group
    }
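
A test would typically unpack this fixture in setUp; a minimal sketch, assuming the function is importable where the test lives:

    def setUp(self):
        # Hypothetical consumer of the fixture dict returned above.
        entities = create_common_entities_w_variants()
        self.project = entities['project']
        self.reference_genome = entities['reference_genome']
        self.samples = entities['samples']
        self.alignment_group = entities['alignment_group']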
Example #24
    def test_run_lumpy__deletion(self):
        """Tests running Lumpy on data that should have a deletion.
        """
        TEST_SAMPLE_UID = '38d786f2'

        user = User.objects.create_user('test_username_sv', password='******',
                email='*****@*****.**')

        # Grab a project.
        self.project = Project.objects.create(title='test project',
                owner=user.get_profile())

        # Use genome with deletion from our sv testing repo:
        # https://github.com/churchlab/structural-variants-testing
        DELETION_TEST_DATA_DIR = os.path.join(TEST_DATA_DIR,
                'sv_testing', 'deletion_bd5a1123')
        REF = os.path.join(DELETION_TEST_DATA_DIR, 'small_ref.fa')
        FASTQ1 = os.path.join(DELETION_TEST_DATA_DIR, 'deletion_bd5a1123.1.fq')
        FASTQ2 = os.path.join(DELETION_TEST_DATA_DIR, 'deletion_bd5a1123.2.fq')
        BWA_ALIGNMENT = os.path.join(DELETION_TEST_DATA_DIR,
                'deletion_bd5a1123.bam')

        # Create Datasets / import data.
        self.reference_genome = import_reference_genome_from_local_file(
                self.project, 'ref_genome', REF, 'fasta')
        self.experiment_sample = ExperimentSample.objects.create(
                project=self.project, label='sample1')
        copy_and_add_dataset_source(self.experiment_sample,
                Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1, FASTQ1)
        copy_and_add_dataset_source(self.experiment_sample,
                Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2, FASTQ2)

        # Create an alignment that's already complete, so we can focus on
        # testing variant calling only.
        self.alignment_group = AlignmentGroup.objects.create(
                label='test alignment', reference_genome=self.reference_genome)

        sample_1 = ExperimentSample.objects.create(
                uid=TEST_SAMPLE_UID,
                project=self.project,
                label='sample1')

        sample_alignment = ExperimentSampleToAlignment.objects.create(
                alignment_group=self.alignment_group,
                experiment_sample=sample_1)
        copy_and_add_dataset_source(sample_alignment, Dataset.TYPE.BWA_ALIGN,
                Dataset.TYPE.BWA_ALIGN, BWA_ALIGNMENT)

        # Run lumpy.
        lumpy_params = VARIANT_TOOL_PARAMS_MAP[TOOL_LUMPY]
        find_variants_with_tool(
                self.alignment_group, lumpy_params, project=self.project)

        # Grab the resulting variants.
        variants = Variant.objects.filter(
                reference_genome=self.reference_genome)

        # Verify that we have the expected deletion around position 10000 of
        # size 1000.
        self.assertEqual(1, len(variants))
        v = variants[0]

        # start position
        self.assertTrue(9950 < v.position < 10050)

        # size
        vccd = v.variantcallercommondata_set.all()[0]
        size = vccd.data['INFO_END'] - v.position
        self.assertTrue(900 < size < 1100)
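        # Worked check under the expected values: with v.position near 10000
        # and INFO_END near 11000, size = INFO_END - position ~= 1000, inside
        # the asserted (900, 1100) window.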
Example #25
def create_common_entities_w_variants():
    """Creates the most common entities for testing.

    Returns a User, Project, ReferenceGenome, alignment,
    and variants that are all related.
    """

    # this is the number of samples in the VCF file
    num_samples = 10

    user = User.objects.create_user(
            TEST_USERNAME, password=TEST_PASSWORD, email=TEST_EMAIL)

    project = Project.objects.create(
            title=TEST_PROJECT_NAME, owner=user.get_profile())

    reference_genome = ReferenceGenome.objects.create(
            project=project,
            label=TEST_REF_GENOME_LABEL)

    chromosome = Chromosome.objects.create(
            reference_genome=reference_genome,
            label='Chromosome',
            seqrecord_id='Chromosome',
            num_bases=9001)

    alignment_group = AlignmentGroup.objects.create(
            label='Alignment 1',
            reference_genome=reference_genome,
            aligner=AlignmentGroup.ALIGNER.BWA)

    Chromosome.objects.create(
        reference_genome=reference_genome,
        label='Chromosome',
        num_bases=2000)

    VCF_DATATYPE = Dataset.TYPE.VCF_FREEBAYES
    copy_and_add_dataset_source(
            alignment_group, VCF_DATATYPE, VCF_DATATYPE, TEST_GENOME_SNPS)

    # Create experiment sample objects having UIDs that correspond to those
    # in the vcf file. This is a bit "fake" in that, in the actual pipeline,
    # we will be generating the vcf file from the samples (see the
    # add_groups() stage of the pipeline).
    with open(TEST_GENOME_SNPS) as fh:
        reader = vcf.Reader(fh)
        experiment_sample_uids = reader.samples
    samples = [ExperimentSample.objects.create(
                    uid=sample_uid,
                    project=project,
                    label='fakename:' + sample_uid)
            for sample_uid in experiment_sample_uids]

    # add samples to alignment group
    for sample in samples:
        ExperimentSampleToAlignment.objects.get_or_create(
                alignment_group=alignment_group,
                experiment_sample=sample)

    # Parse the vcf
    parse_alignment_group_vcf(alignment_group, VCF_DATATYPE)

    return {
        'user': user,
        'project': project,
        'reference_genome': reference_genome,
        'chromosome': chromosome,
        'samples': samples,
        'alignment_group': alignment_group
    }
Example #26
    def test_end_to_end(self):
        """Test running full pipline on small-ish data.

        The data file consists of 20,000 bases. At 5,000 bases there is
        a 400 base deletion. At 10,000 bases there is a 400 base inversion.
        At 15,000 bases there is a 400 base tandem duplication.

        It seems that Pindel cannot find the inversion. Fortunately,
        delly can usually find inversions. Unfortunately, delly only
        works well on large data, so we will not test it here.
        """
        # Create a new alignment group.
        alignment_group = AlignmentGroup.objects.create(
            label='test alignment', reference_genome=self.reference_genome)

        # Create a sample.
        sample_1 = ExperimentSample.objects.create(uid=TEST_SAMPLE_UID,
                                                   project=self.project,
                                                   label='sample1')
        ### Add the raw reads
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ1,
                                    Dataset.TYPE.FASTQ1, TEST_FASTQ1)
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ2,
                                    Dataset.TYPE.FASTQ2, TEST_FASTQ2)

        # Create relationship between alignment and sample.
        sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=alignment_group, experiment_sample=sample_1)
        ### Add alignment data. NOTE: Stored in sample model dir.

        # index (no dataset)
        copy_dataset_to_entity_data_dir(sample_1, TEST_BAM_INDEX)

        # bam file (with dataset)
        copy_dest = copy_dataset_to_entity_data_dir(sample_1, TEST_BAM)
        add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
                              Dataset.TYPE.BWA_ALIGN, copy_dest)

        # Make sure there are no variants before.
        self.assertEqual(
            0,
            len(Variant.objects.filter(
                reference_genome=self.reference_genome)))

        # Test with Pindel only for now.
        for tool in ['pindel']:
            find_variants_with_tool(alignment_group,
                                    VARIANT_TOOL_PARAMS_MAP[tool],
                                    project=self.project)

        # Check that the alignment group has a pindel vcf dataset associated
        # with it.
        vcf_dataset = get_dataset_with_type(alignment_group,
                                            Dataset.TYPE.VCF_PINDEL)
        self.assertIsNotNone(vcf_dataset)

        # Make sure the .vcf file actually exists.
        self.assertTrue(os.path.exists(vcf_dataset.get_absolute_location()))

        # Make sure the vcf is valid by reading it using pyvcf.
        with open(vcf_dataset.get_absolute_location()) as vcf_fh:
            try:
                reader = vcf.Reader(vcf_fh)
                next(reader)
            except Exception:
                self.fail("Not a valid vcf")

        # Grab the resulting variants.
        variants = Variant.objects.filter(
            reference_genome=self.reference_genome)

        # Confirm that 2 variants were found.
        self.assertEqual(2, len(variants))

        variant_map = {}
        for variant in variants:
            variant_alternates = VariantAlternate.objects.filter(
                variant=variant)

            # There should be only one variant alternate per SV.
            self.assertEqual(len(variant_alternates), 1)

            pos = variant.position
            svtype = variant_alternates[0].data['INFO_SVTYPE']
            svlen = variant_alternates[0].data['INFO_SVLEN']
            variant_map[svtype] = (pos, svlen)

        # Check that there is a deletion around base 5000.
        self.assertTrue('DEL' in variant_map)
        self.assertTrue(abs(variant_map['DEL'][0] - 5000) <= 3)
        self.assertTrue(abs(variant_map['DEL'][1] - 400) <= 3)

        # Check that there is a tandem duplication around base 15000.
        self.assertTrue('DUP:TANDEM' in variant_map)
        self.assertTrue(abs(variant_map['DUP:TANDEM'][0] - 15000) <= 3)
        self.assertTrue(abs(variant_map['DUP:TANDEM'][1] - 400) <= 3)
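
The closing assertions tolerate a few bases of breakpoint drift from Pindel; the same idea, factored into a hypothetical helper method (not part of the original test case):

    def assert_sv_near(self, variant_map, svtype, expected_pos, expected_len,
                       tol=3):
        # Hypothetical helper mirroring the tolerance checks above.
        pos, svlen = variant_map[svtype]
        self.assertTrue(abs(pos - expected_pos) <= tol)
        self.assertTrue(abs(svlen - expected_len) <= tol)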
Example #27
    def test_parser(self):
        """Basic tests for the parser.
        """
        VCF_DATATYPE = Dataset.TYPE.VCF_FREEBAYES
        alignment_group = AlignmentGroup.objects.create(
                label='test alignment', reference_genome=self.reference_genome)
        copy_and_add_dataset_source(alignment_group, VCF_DATATYPE,
                VCF_DATATYPE, TEST_GENOME_SNPS)

        Chromosome.objects.create(
            reference_genome=self.reference_genome,
            label='Chromosome',
            num_bases=9001)

        # Create experiment sample objects having UIDs that correspond to
        # those in the vcf file. This is a bit "fake" in that, in the actual
        # pipeline, we will be generating the vcf file from the samples (see
        # the add_groups() stage of the pipeline).
        with open(TEST_GENOME_SNPS) as fh:
            reader = vcf.Reader(fh)
            experiment_sample_uids = reader.samples
        num_experiment_samples = len(experiment_sample_uids)
        for sample_uid in experiment_sample_uids:
            ExperimentSample.objects.create(
                uid=sample_uid,
                project=self.project,
                label='fakename:' + sample_uid
            )

        # Count the number of records in the vcf file for testing.
        record_count = 0
        with open(TEST_GENOME_SNPS) as fh:
            for record in vcf.Reader(fh):
                record_count += 1

        # Parse the vcf
        parse_alignment_group_vcf(alignment_group, VCF_DATATYPE)

        variant_list = Variant.objects.filter(
                reference_genome=self.reference_genome)

        # There should be one Variant object for each record.
        self.assertEqual(record_count, len(variant_list))

        # Spot-check a few variants.
        self.assertEqual(1, len(Variant.objects.filter(
                reference_genome=self.reference_genome,
                position=376)))

        v_453 = Variant.objects.get(reference_genome=self.reference_genome,
                position=453)
        self.assertEqual(['G'], v_453.get_alternates())

        # Check that no spurious variant was created at a neighboring
        # position (guard against false positives).
        self.assertEqual(0, len(Variant.objects.filter(
                reference_genome=self.reference_genome,
                position=454)))

        # There should be one VariantCallerCommonData object for each record.
        self.assertEqual(record_count,
                len(VariantCallerCommonData.objects.filter(
                        variant__reference_genome=self.reference_genome)))

        # There should also be one VariantEvidence object per Variant x Sample.
        for variant in variant_list:
            vccd = variant.variantcallercommondata_set.all()[0]
            self.assertEqual(num_experiment_samples,
                    len(vccd.variantevidence_set.all()))

        # Check that alternate data is populated.
        #Chromosome  1330    .   CG  C,GC,AG 126.036 .   AB=0.5,0.5,1;ABP=3.0103,3.0103,7.35324;AC=1,1,1;AF=0.0833333,0.0833333,0.0833333;AN=12;AO=1,1,2;CIGAR=1M1D,2X,1X1M;DP=10;DPRA=1.33333,1.33333,1.33333;EPP=5.18177,5.18177,3.0103;EPPR=4.45795;HWE=-16.5861;LEN=1,2,1;MEANALT=2,2,1;MQM=60,37,48.5;MQMR=40.8333;NS=6;NUMALT=3;ODDS=1.50408;PAIRED=1,0,0.5;PAIREDR=0.166667;RO=6;RPP=5.18177,5.18177,7.35324;RPPR=16.0391;RUN=1,1,1;SAP=5.18177,5.18177,3.0103;SRP=4.45795;TYPE=del,mnp,snp;XAI=0,0.0102041,0.00515464;XAM=0,0.0102041,0.0253649;XAS=0,0,0.0202103;XRI=0.0016835;XRM=0.00835084;XRS=0.00666733;technology.illumina=1,1,1;BVAR GT:DP:RO:QR:AO:QA:GL    .   0/0:1:1:36:0,0,0:0,0,0:0,-0.30103,-3.6,-0.30103,-3.6,-3.6,-0.30103,-3.6,-3.6,-3.6   0/0:2:2:76:0,0,0:0,0,0:0,-0.60206,-7.03,-0.60206,-7.03,-7.03,-0.60206,-7.03,-7.03,-7.03 1/2:2:0:0:1,1,0:108,31,0:-8.645,-3.40103,-3.1,-6.30103,-0.30103,-6,-8.645,-3.40103,-6.30103,-8.645  .   0/3:2:0:0:0,0,2:0,0,73:-6.935,-6.935,-6.935,-6.935,-6.935,-6.935,-0.60206,-0.60206,-0.60206,0   0/0:2:2:72:0,0,0:0,0,0:0,-0.60206,-6.84,-0.60206,-6.84,-6.84,-0.60206,-6.84,-6.84,-6.84 .   0/0:1:1:34:0,0,0:0,0,0:0,-0.30103,-3.4,-0.30103,-3.4,-3.4,-0.30103,-3.4,-3.4,-3.4   .
        v_1330 = Variant.objects.get(reference_genome=self.reference_genome,
                position=1330)
        self.assertEqual(set(v_1330.get_alternates()), set(['C', 'GC', 'AG']))
        v_1330_c = VariantAlternate.objects.get(variant=v_1330, alt_value='C')
        self.assertTrue(len(v_1330_c.variantevidence_set.all()))
        v_1330_gc = VariantAlternate.objects.get(variant=v_1330, alt_value='GC')
        self.assertTrue(len(v_1330_gc.variantevidence_set.all()))
        self.assertEqual(v_1330_c.data['INFO_ABP'], v_1330_gc.data['INFO_ABP'])
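
For reference, only two pieces of the long freebayes record quoted above drive these assertions:

        # REF=CG, ALT=C,GC,AG -> get_alternates() == {'C', 'GC', 'AG'}
        # ABP=3.0103,3.0103,7.35324 (per-alternate) -> equal for alts 'C'
        # and 'GC', hence the final INFO_ABP equality check.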
Example #28
def bootstrap_fake_data():
    """Fill the database with fake data.
    """
    user = get_or_create_user()

    ### Create some projects
    (test_project, project_created) = Project.objects.get_or_create(
            title=TEST_PROJECT_NAME, owner=user.get_profile())
    (test_project_2, project_created) = Project.objects.get_or_create(
            title=SV_PROJECT_NAME, owner=user.get_profile())

    ### Create some reference genomes
    ref_genome_1 = import_reference_genome_from_local_file(
            test_project, REF_GENOME_1_LABEL, TEST_FASTA, 'fasta')

    ref_genome_2 = import_reference_genome_from_local_file(
            test_project, REF_GENOME_2_LABEL, TEST_FASTA, 'fasta')

    ref_genome_3 = import_reference_genome_from_local_file(
            test_project, 'test_genome', TEST_FASTA, 'fasta')

    ### Create some saved queries.
    for saved_query_text in CUSTOM_SAVED_QUERY_LIST:
        SavedVariantFilterQuery.objects.get_or_create(
                owner=user.get_profile(),
                text=saved_query_text)

    ### Create some ExperimentSamples.

    # Create some samples without backing data just to explore the UI.
    ExperimentSample.objects.create(
            project=test_project,
            label='C321D_MiSeq',
            data={'SAMPLE_WELL': 'A01'}
    )

    ExperimentSample.objects.create(
            project=test_project,
            label='C321D Fixed 01',
            data={'SAMPLE_WELL': 'A02'}
    )

    ExperimentSample.objects.create(
            project=test_project,
            label='C321D Fixed 02',
            data={'SAMPLE_WELL': 'A03'}
    )

    # Create some samples with backing data.
    (sample_1, created) = ExperimentSample.objects.get_or_create(
            project=test_project,
            label=SAMPLE_1_LABEL)
    # Add datasets to the samples.
    if not sample_1.dataset_set.filter(type=Dataset.TYPE.FASTQ1):
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1, TEST_FASTQ1)
    if not sample_1.dataset_set.filter(type=Dataset.TYPE.FASTQ2):
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2, TEST_FASTQ2)

    # Create sample backed by g-zipped data.
    gz_backed_sample = ExperimentSample.objects.create(
            project=test_project,
            label='sample backed by gz data')
    gz_fastq1_dataset = copy_and_add_dataset_source(
            gz_backed_sample, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
            TEST_FASTQ_GZ_1)
    gz_fastq2_dataset = copy_and_add_dataset_source(
            gz_backed_sample, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ2,
            TEST_FASTQ_GZ_2)
    run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq1_dataset)
    run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq2_dataset, rev=True)

    ### Create an alignment.
    alignment_group_1 = AlignmentGroup.objects.create(
            label='Alignment 1',
            reference_genome=ref_genome_3,
            aligner=AlignmentGroup.ALIGNER.BWA)
    # Link it to a sample.
    sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=alignment_group_1,
            experiment_sample=sample_1)
    ### Add alignment data. NOTE: Stored in sample model dir.
    # NOTE: This is a bit convoluted. Perhaps it would be better to store alignments
    # in the ExperimentSampleToAlignment directory.
    copy_dest = copy_dataset_to_entity_data_dir(sample_1, TEST_BAM)
    copy_dataset_to_entity_data_dir(sample_1, TEST_BAM_INDEX)
    add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
            Dataset.TYPE.BWA_ALIGN, copy_dest)

    # Create fake variants.
    create_fake_variants_and_variant_sets(ref_genome_1)

    #############################
    # Full VCF Testing (annotated for snpeff, variant filtering, etc)
    #############################

    # Create a new reference genome and samples using full_vcf_test_set
    full_vcf_reference_genome = import_reference_genome_from_local_file(
                test_project, 'mg1655_tolC_through_zupT',
                FullVCFTestSet.TEST_GENBANK, 'genbank')

    # Create all samples.
    parent_obj = None
    full_vcf_samples = []
    for i in range(FullVCFTestSet.NUM_SAMPLES):
        sample_obj = ExperimentSample.objects.create(
                project=test_project,
                label='Sample %d' % i)

        sample_obj.data['SAMPLE_WELL'] = 'A0%d' % (i+1)

        if i == 0:
            parent_obj = sample_obj
        if i > 0:
            sample_obj.data['SAMPLE_PARENTS'] = parent_obj.label
            parent_obj.add_child(sample_obj)
            parent_obj.save()

        sample_obj.save()

        # Add raw reads to each sample.
        fastq1_dataset = copy_and_add_dataset_source(sample_obj,
                Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1,
                FullVCFTestSet.FASTQ1[i])
        fastq2_dataset = copy_and_add_dataset_source(sample_obj,
                Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2,
                FullVCFTestSet.FASTQ2[i])

        # Run FASTQC on sample reads.
        run_fastqc_on_sample_fastq(sample_obj, fastq1_dataset)
        run_fastqc_on_sample_fastq(sample_obj, fastq2_dataset, rev=True)

        full_vcf_samples.append(sample_obj)

    # Run the alignment pipeline. Returns the alignment group created and an
    # async result for the pipeline run.
    (full_vcf_alignment_group, pipeline_async_result) = run_pipeline(
            'test_align', full_vcf_reference_genome, full_vcf_samples)

    import_variant_set_from_vcf(full_vcf_reference_genome, 'Designed',
            FullVCFTestSet.TEST_DESIGNED_SNPS)

    def _create_region_intervals(region, interval_tuple_list):
        """Helper method to create RegionIntervals for a Region.

        Args:
            region: Region Model object.
            interval_tuple_list: List of tuples of intervals to create.
        """
        for interval in interval_tuple_list:
            RegionInterval.objects.create(
                    region=region,
                    start=interval[0],
                    end=interval[1])

    # Create some fake regions.
    # TODO: Should not be much harder to replace this with real regions.
    region_1 = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='region_1',
        type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_1, [(1, 150), (300, 400), (500, 900)])

    region_2 = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='region_2',
        type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_2, [(1000, 1500)])

    region_3 = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='region_3',
        type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_3, [(1800, 1900), (2150, 2300)])

    # And some GENE regions.

    gene_A = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='geneA',
        type=Region.TYPE.GENE)
    _create_region_intervals(gene_A, [(2000, 2400)])

    gene_B = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='geneB',
        type=Region.TYPE.GENE)
    _create_region_intervals(gene_B, [(4800, 5200)])

    gene_C = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='geneC',
        type=Region.TYPE.GENE)
    _create_region_intervals(gene_C, [(1, 500)])

    # Bootstrap test_project_2 with SV test data.
    sv_testing_bootstrap(test_project_2)
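
The loop above wires every sample after the first to 'Sample 0', both via
the SAMPLE_PARENTS metadata key and via add_child(). Below is a minimal
sketch of reading that lineage back from the metadata alone;
build_lineage_map is a hypothetical helper, not part of the codebase:

def build_lineage_map(samples):
    """Hypothetical helper: group sample labels by the parent label
    recorded under SAMPLE_PARENTS during bootstrapping.

    Assumes each sample's .data dict carries the SAMPLE_PARENTS key set
    in the loop above, and that sample labels are unique.
    """
    lineage = {}
    for sample in samples:
        parent_label = sample.data.get('SAMPLE_PARENTS')
        if parent_label is not None:
            lineage.setdefault(parent_label, []).append(sample.label)
    return lineage

# With the loop above, every sample i > 0 points at 'Sample 0':
#   build_lineage_map(full_vcf_samples)
#   => {'Sample 0': ['Sample 1', 'Sample 2', ...]}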
Example #29
    def test_end_to_end(self):
        """Test running full pipline on small-ish data.

        The data file consists of 20,000 bases. At 5,000 bases there is
        a 400 base deletion. At 10,000 bases there is a 400 base inversion.
        At 15,000 bases there is a 400 base tandem duplication.

        It seems that Pindel cannot find the inversion. Fortunately,
        Delly can usually find inversions. Unfortunately, Delly only
        works well on large datasets, so we do not test it here. (A
        sketch of how such a donor sequence can be generated follows
        this example.)
        """
        # Create a new alignment group.
        alignment_group = AlignmentGroup.objects.create(
                label='test alignment', reference_genome=self.reference_genome)

        # Create a sample.
        sample_1 = ExperimentSample.objects.create(
                uid=TEST_SAMPLE_UID,
                project=self.project,
                label='sample1')
        ### Add the raw reads
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1, TEST_FASTQ1)
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2, TEST_FASTQ2)

        # Create relationship between alignment and sample.
        sample_alignment = ExperimentSampleToAlignment.objects.create(
                alignment_group=alignment_group,
                experiment_sample=sample_1)
        ### Add alignment data. NOTE: Stored in sample model dir.

        # index (no dataset)
        copy_dataset_to_entity_data_dir(sample_1, TEST_BAM_INDEX)

        # bam file (with dataset)
        copy_dest = copy_dataset_to_entity_data_dir(sample_1, TEST_BAM)
        add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
                Dataset.TYPE.BWA_ALIGN, copy_dest)

        # Make sure there are no variants before.
        self.assertEqual(0, len(Variant.objects.filter(
                reference_genome=self.reference_genome)))

        # Test with Pindel only for now.
        for tool in ['pindel']:
            find_variants_with_tool(alignment_group,
                    VARIANT_TOOL_PARAMS_MAP[tool], project=self.project)

        # Check that the alignment group has a Pindel VCF dataset
        # associated with it.
        vcf_dataset = get_dataset_with_type(alignment_group,
                Dataset.TYPE.VCF_PINDEL)
        self.assertIsNotNone(vcf_dataset)

        # Make sure the .vcf file actually exists.
        self.assertTrue(os.path.exists(vcf_dataset.get_absolute_location()))

        # Make sure the vcf is valid by reading it using PyVCF.
        with open(vcf_dataset.get_absolute_location()) as vcf_fh:
            try:
                reader = vcf.Reader(vcf_fh)
                next(reader)
            except Exception:
                self.fail("Not a valid vcf")

        # Grab the resulting variants.
        variants = Variant.objects.filter(reference_genome=self.reference_genome)

        # Confirm that 2 variants were found.
        self.assertEqual(2, len(variants))

        variant_map = {}
        for variant in variants:
            variant_alternates = VariantAlternate.objects.filter(variant=variant)

            # There should be only one variant alternate per SV.
            self.assertEqual(len(variant_alternates), 1)

            pos = variant.position
            svtype = variant_alternates[0].data['INFO_SVTYPE']
            svlen = variant_alternates[0].data['INFO_SVLEN']
            variant_map[svtype] = (pos, svlen)

        # Check that there is a deletion around base 5000.
        self.assertTrue('DEL' in variant_map)
        self.assertTrue(abs(variant_map['DEL'][0] - 5000) <= 3)
        self.assertTrue(abs(variant_map['DEL'][1] - 400) <= 3)

        # Check that there is a tandem duplication around base 15000.
        self.assertTrue('DUP:TANDEM' in variant_map)
        self.assertTrue(abs(variant_map['DUP:TANDEM'][0] - 15000) <= 3)
        self.assertTrue(abs(variant_map['DUP:TANDEM'][1] - 400) <= 3)
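
As noted in the docstring, here is a minimal sketch of how a donor
sequence with this SV layout could be generated. make_sv_test_sequence is
illustrative only: it is not the script that produced the checked-in
fixture, and a real test would still need to simulate reads from the donor.

import random

# Complement map for building the reverse complement of the inverted segment.
_COMPLEMENT = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

def _revcomp(seq):
    return ''.join(_COMPLEMENT[base] for base in reversed(seq))

def make_sv_test_sequence(length=20000, sv_len=400, seed=0):
    """Return (reference, donor) where the donor carries a 400 bp deletion
    at 5,000, a 400 bp inversion at 10,000, and a 400 bp tandem duplication
    at 15,000, mirroring the layout described in the docstring above.
    """
    rng = random.Random(seed)
    reference = ''.join(rng.choice('ACGT') for _ in range(length))

    # Apply the SVs from right to left so earlier coordinates stay valid.
    donor = (reference[:15000]
             + reference[15000:15000 + sv_len] * 2    # tandem duplication
             + reference[15000 + sv_len:])
    donor = (donor[:10000]
             + _revcomp(donor[10000:10000 + sv_len])  # inversion
             + donor[10000 + sv_len:])
    donor = donor[:5000] + donor[5000 + sv_len:]      # deletion
    return reference, donor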
Example #30
def bootstrap_fake_data():
    """Fill the database with fake data.
    """
    user = get_or_create_user()

    ### Create some projects
    (test_project,
     project_created) = Project.objects.get_or_create(title=TEST_PROJECT_NAME,
                                                      owner=user.get_profile())
    (test_project_2,
     project_created) = Project.objects.get_or_create(title=SV_PROJECT_NAME,
                                                      owner=user.get_profile())

    ### Create some reference genomes
    ref_genome_1 = import_reference_genome_from_local_file(
        test_project, REF_GENOME_1_LABEL, TEST_FASTA, 'fasta')

    ref_genome_2 = import_reference_genome_from_local_file(
        test_project, REF_GENOME_2_LABEL, TEST_FASTA, 'fasta')

    ref_genome_3 = import_reference_genome_from_local_file(
        test_project, 'test_genome', TEST_FASTA, 'fasta')

    ### Create some saved queries.
    for saved_query_text in CUSTOM_SAVED_QUERY_LIST:
        SavedVariantFilterQuery.objects.get_or_create(owner=user.get_profile(),
                                                      text=saved_query_text)

    ### Create some ExperimentSamples.

    # Create some samples without backing data just to explore the UI.
    ExperimentSample.objects.create(project=test_project,
                                    label='C321D_MiSeq',
                                    data={'SAMPLE_WELL': 'A01'})

    ExperimentSample.objects.create(project=test_project,
                                    label='C321D Fixed 01',
                                    data={'SAMPLE_WELL': 'A02'})

    ExperimentSample.objects.create(project=test_project,
                                    label='C321D Fixed 02',
                                    data={'SAMPLE_WELL': 'A03'})

    # Create some samples with backing data.
    (sample_1,
     created) = ExperimentSample.objects.get_or_create(project=test_project,
                                                       label=SAMPLE_1_LABEL)
    # Add datasets to the samples.
    if not sample_1.dataset_set.filter(type=Dataset.TYPE.FASTQ1):
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ1,
                                    Dataset.TYPE.FASTQ1, TEST_FASTQ1)
    if not sample_1.dataset_set.filter(type=Dataset.TYPE.FASTQ2):
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ2,
                                    Dataset.TYPE.FASTQ2, TEST_FASTQ2)

    # Create a sample backed by gzipped data.
    gz_backed_sample = ExperimentSample.objects.create(
        project=test_project, label='sample backed by gz data')
    gz_fastq1_dataset = copy_and_add_dataset_source(gz_backed_sample,
                                                    Dataset.TYPE.FASTQ1,
                                                    Dataset.TYPE.FASTQ1,
                                                    TEST_FASTQ_GZ_1)
    gz_fastq2_dataset = copy_and_add_dataset_source(gz_backed_sample,
                                                    Dataset.TYPE.FASTQ1,
                                                    Dataset.TYPE.FASTQ2,
                                                    TEST_FASTQ_GZ_2)
    run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq1_dataset)
    run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq2_dataset, rev=True)

    ### Create an alignment.
    alignment_group_1 = AlignmentGroup.objects.create(
        label='Alignment 1',
        reference_genome=ref_genome_3,
        aligner=AlignmentGroup.ALIGNER.BWA)
    # Link it to a sample.
    sample_alignment = ExperimentSampleToAlignment.objects.create(
        alignment_group=alignment_group_1, experiment_sample=sample_1)
    ### Add alignment data. NOTE: Stored in sample model dir.
    # NOTE: This is a bit convoluted. Perhaps it would be better to store alignments
    # in the ExperimentSampleToAlignment directory.
    copy_dest = copy_dataset_to_entity_data_dir(sample_1, TEST_BAM)
    copy_dataset_to_entity_data_dir(sample_1, TEST_BAM_INDEX)
    add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
                          Dataset.TYPE.BWA_ALIGN, copy_dest)

    # Create fake variants.
    create_fake_variants_and_variant_sets(ref_genome_1)

    #############################
    # Full VCF Testing (annotated for snpeff, variant filtering, etc)
    #############################

    # Create a new reference genome and samples using full_vcf_test_set
    full_vcf_reference_genome = import_reference_genome_from_local_file(
        test_project, 'mg1655_tolC_through_zupT', FullVCFTestSet.TEST_GENBANK,
        'genbank')

    # Create all samples. Sample 0 becomes the parent of every later sample.
    parent_obj = None
    full_vcf_samples = []
    for i in range(FullVCFTestSet.NUM_SAMPLES):
        sample_obj = ExperimentSample.objects.create(project=test_project,
                                                     label='Sample %d' % i)

        sample_obj.data['SAMPLE_WELL'] = 'A0%d' % (i + 1)

        if i == 0:
            parent_obj = sample_obj
        if i > 0:
            sample_obj.data['SAMPLE_PARENTS'] = parent_obj.label
            parent_obj.add_child(sample_obj)
            parent_obj.save()

        sample_obj.save()

        # Add raw reads to each sample.
        fastq1_dataset = copy_and_add_dataset_source(sample_obj,
                                                     Dataset.TYPE.FASTQ1,
                                                     Dataset.TYPE.FASTQ1,
                                                     FullVCFTestSet.FASTQ1[i])
        fastq2_dataset = copy_and_add_dataset_source(sample_obj,
                                                     Dataset.TYPE.FASTQ2,
                                                     Dataset.TYPE.FASTQ2,
                                                     FullVCFTestSet.FASTQ2[i])

        # Run FastQC on sample reads.
        run_fastqc_on_sample_fastq(sample_obj, fastq1_dataset)
        run_fastqc_on_sample_fastq(sample_obj, fastq2_dataset, rev=True)

        full_vcf_samples.append(sample_obj)

    # Run the alignment pipeline. The AlignmentGroup and async result it
    # returns are not needed here.
    run_pipeline('test_align', full_vcf_reference_genome, full_vcf_samples)

    import_variant_set_from_vcf(full_vcf_reference_genome, 'Designed',
                                FullVCFTestSet.TEST_DESIGNED_SNPS)

    def _create_region_intervals(region, interval_tuple_list):
        """Helper method to create RegionIntervals for a Region.

        Args:
            region: Region Model object.
            interval_tuple_list: List of tuples of intervals to create.
        """
        for interval in interval_tuple_list:
            RegionInterval.objects.create(region=region,
                                          start=interval[0],
                                          end=interval[1])

    # Create some fake regions.
    # TODO: It should not be much harder to replace these with real regions.
    region_1 = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='region_1',
        type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_1, [(1, 150), (300, 400), (500, 900)])

    region_2 = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='region_2',
        type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_2, [(1000, 1500)])

    region_3 = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='region_3',
        type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_3, [(1800, 1900), (2150, 2300)])

    # And some GENE regions.

    gene_A = Region.objects.create(reference_genome=full_vcf_reference_genome,
                                   label='geneA',
                                   type=Region.TYPE.GENE)
    _create_region_intervals(gene_A, [(2000, 2400)])

    gene_B = Region.objects.create(reference_genome=full_vcf_reference_genome,
                                   label='geneB',
                                   type=Region.TYPE.GENE)
    _create_region_intervals(gene_B, [(4800, 5200)])

    gene_C = Region.objects.create(reference_genome=full_vcf_reference_genome,
                                   label='geneC',
                                   type=Region.TYPE.GENE)
    _create_region_intervals(gene_C, [(1, 500)])

    # Bootstrap test_project_2 with SV test data.
    sv_testing_bootstrap(test_project_2)
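
For completeness, a sketch of running the bootstrap as a standalone
script. The settings module name and the import path are hypothetical and
depend on the project layout:

import os

# Point Django at the project settings before touching the ORM.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'settings')

# On Django >= 1.7 you would also need:
#   import django; django.setup()

from scripts.bootstrap_data import bootstrap_fake_data  # hypothetical path

bootstrap_fake_data()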