Example #1
    def test_post_add_seq_to_ref_genome(self):
        """
        Ensure that everything gets converted after creating a new reference
        genome object, like snpeff, fasta, gff, etc.
        """

        # SNPEFF

        # Make sure Genbank file exists where expected.
        gbk_path = self.test_ext_ref_genome.get_snpeff_genbank_file_path()
        self.assertTrue(os.path.exists(gbk_path),
                'snpeff gbk conversion failed: %s' % gbk_path)

        # check that the db was made
        snpEffPredictor_bin_path = os.path.join(
                self.test_ext_ref_genome.get_snpeff_genbank_parent_dir(),
                'snpEffectPredictor.bin')
        self.assertTrue(os.path.exists(snpEffPredictor_bin_path),
                'snpeff db was not made')

        # FASTA
        fasta = get_dataset_with_type(self.test_ext_ref_genome,
                type=Dataset.TYPE.REFERENCE_GENOME_FASTA)
        assert os.path.exists(fasta.get_absolute_location()), (
                'fasta conversion failed')

        # GFF
        gff = get_dataset_with_type(self.test_ext_ref_genome,
                type=Dataset.TYPE.REFERENCE_GENOME_GFF)
        assert os.path.exists(gff.get_absolute_location()), (
                'gff conversion failed')
Example #2
    def test_post_add_seq_to_ref_genome(self):
        """
        Ensure that everything gets converted after creating a new reference
        genome object, like snpeff, fasta, gff, etc.
        """

        # SNPEFF

        # check that the genbank file was symlinked
        gbk_path = os.path.join(
                self.test_ext_ref_genome.get_snpeff_directory_path(),
                'genes.gb')
        assert os.path.exists(gbk_path), 'snpeff gbk conversion failed: %s' % (
                gbk_path)

        # check that the db was made
        assert os.path.exists(os.path.join(
                self.test_ext_ref_genome.get_snpeff_directory_path(),
                'snpEffectPredictor.bin')), 'snpeff db was not made'

        # FASTA
        fasta = get_dataset_with_type(self.test_ext_ref_genome,
                type=Dataset.TYPE.REFERENCE_GENOME_FASTA)
        assert os.path.exists(fasta.get_absolute_location()), (
                'fasta conversion failed')

        # GFF
        gff = get_dataset_with_type(self.test_ext_ref_genome,
                type=Dataset.TYPE.REFERENCE_GENOME_GFF)
        assert os.path.exists(gff.get_absolute_location()), (
                'gff conversion failed')
Example #3
    def test_compress_dataset(self):
        """
        Make sure that compressing a dataset and putting a new dataset
        entry into the db works correctly.
        """
        user = User.objects.create_user(TEST_USERNAME,
                                        password=TEST_PASSWORD,
                                        email=TEST_EMAIL)

        self.test_project = Project.objects.create(title=TEST_PROJECT_NAME,
                                                   owner=user.get_profile())

        self.test_ref_genome = import_reference_genome_from_local_file(
            self.test_project, TEST_REF_GENOME_NAME, TEST_REF_GENOME_PATH,
            'genbank')

        dataset = get_dataset_with_type(
            self.test_ref_genome, type=Dataset.TYPE.REFERENCE_GENOME_GENBANK)

        # All the magic happens here
        compressed_dataset = dataset.make_compressed('.gz')

        # Grab the new compressed dataset through the ref genome to
        # make sure that it got added
        compressed_dataset_through_ref_genome = get_dataset_with_type(
            entity=self.test_ref_genome,
            type=Dataset.TYPE.REFERENCE_GENOME_GENBANK,
            compressed=True)
        assert compressed_dataset == compressed_dataset_through_ref_genome
Example #4
    def test_compress_dataset(self):
        """
        Make sure that compressing a dataset and putting a new dataset
        entry into the db works correctly.
        """
        user = User.objects.create_user(TEST_USERNAME, password=TEST_PASSWORD,
                email=TEST_EMAIL)

        self.test_project = Project.objects.create(
            title=TEST_PROJECT_NAME,
            owner=user.get_profile())

        self.test_ref_genome = import_reference_genome_from_local_file(
            self.test_project,
            TEST_REF_GENOME_NAME,
            TEST_REF_GENOME_PATH,
            'genbank')

        dataset = get_dataset_with_type(self.test_ref_genome,
                type= Dataset.TYPE.REFERENCE_GENOME_GENBANK)

        # All the magic happens here
        compressed_dataset = dataset.make_compressed('.gz')

        # Grab the new compressed dataset through the ref genome to
        # make sure that it got added
        compressed_dataset_through_ref_genome = get_dataset_with_type(
                entity= self.test_ref_genome,
                type= Dataset.TYPE.REFERENCE_GENOME_GENBANK,
                compressed= True)
        assert compressed_dataset == compressed_dataset_through_ref_genome
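
The tests above exercise the project's own Dataset.make_compressed() helper. For orientation only, here is a minimal sketch (standard library only, names illustrative) of the gzip step such a helper presumably performs; the Dataset bookkeeping checked by the tests is not reproduced.

import gzip
import shutil

def gzip_copy_sketch(src_path):
    """Writes src_path + '.gz' next to the original file and returns the
    new path. Assumption: this mirrors only the compression step, not the
    Dataset record-keeping that make_compressed() also performs.
    """
    dest_path = src_path + '.gz'
    with open(src_path, 'rb') as src_fh:
        with gzip.open(dest_path, 'wb') as dest_fh:
            shutil.copyfileobj(src_fh, dest_fh)
    return dest_path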
Example #5
def add_genbank_file_track(reference_genome, **kwargs):
    """
    Jbrowse has the ability to make tracks out of genbank files. This
    takes the genbank file from a reference_genome object and attempts to
    make such a track and then add it to the track list.
    """
    FLATFILE_TRACK_BIN = os.path.join(JBROWSE_BIN_PATH, 'flatfile-to-json.pl')

    reference_gbk = get_dataset_with_type(
        reference_genome,
        type=Dataset.TYPE.REFERENCE_GENOME_GENBANK).get_absolute_location()

    jbrowse_path = reference_genome.get_jbrowse_directory_path()

    reference_gff = get_dataset_with_type(
        reference_genome,
        type=Dataset.TYPE.REFERENCE_GENOME_GFF).get_absolute_location()

    json_update_fields = {
        'style': {
            'label': 'name,CDS,gene',
            'description': 'note,function,gene_synonym',
            'color': '#5fbcdd'
        }
    }

    genbank_json_command = [
        FLATFILE_TRACK_BIN,
        '--gff',
        reference_gff,
        '--out',
        os.path.join(jbrowse_path, 'indiv_tracks', 'gbk'),
        '--type',
        JBROWSE_GBK_TYPES_TO_DISPLAY,
        '--autocomplete',
        'all',
        '--trackLabel',
        'gbk',
        '--key',
        "Genome Features",
        '--trackType',
        "CanvasFeatures",
        #'--getSubfeatures',
        #'--className','transcript',
        #'--subfeatureClasses', "{\"CDS\":\"transcript-CDS\"}"
    ]

    subprocess.check_call(genbank_json_command)

    # Finally, manually update tracklist json with style info
    tracklist_json = get_tracklist_json(reference_genome, 'gbk')

    for i, track in enumerate(tracklist_json['tracks']):
        if track['key'] == 'Genome Features':
            tracklist_json['tracks'][i] = merge_nested_dictionaries(
                track, json_update_fields)

    write_tracklist_json(reference_genome, tracklist_json, 'gbk')
Example #6
    def test_basic(self):
        """Basic test.
        """
        self.reference_genome = import_reference_genome_from_local_file(
                self.project, 'ref_genome', TEST_GENBANK, 'genbank')
        variant_set = VariantSet.objects.create(
                reference_genome=self.reference_genome,
                label='vs1')

        ref_genome_filepath = get_dataset_with_type(self.reference_genome,
                Dataset.TYPE.REFERENCE_GENOME_GENBANK).get_absolute_location()

        with open(ref_genome_filepath) as fh:
            ref_genome_seq_record = SeqIO.read(fh, 'genbank')

        for position in range(10, 111, 10):
            ref_value = ref_genome_seq_record[position - 1]
            var = Variant.objects.create(
                    type=Variant.TYPE.TRANSITION,
                    reference_genome=self.reference_genome,
                    chromosome=Chromosome.objects.get(reference_genome=self.reference_genome),
                    position=position,
                    ref_value=ref_value)

            VariantAlternate.objects.create(
                    variant=var, alt_value='G')

            VariantToVariantSet.objects.create(
                    variant=var, variant_set=variant_set)

        new_ref_genome_params = {
            'label': 'new'
        }

        new_ref_genome = generate_new_reference_genome(
                variant_set, new_ref_genome_params)

        new_ref_genome_filepath = get_dataset_with_type(
                        new_ref_genome,
                        Dataset.TYPE.REFERENCE_GENOME_GENBANK)\
                .get_absolute_location()
        with open(new_ref_genome_filepath) as fh:
            new_ref_genome_seq_record = SeqIO.read(fh, 'genbank')

        # Assert size unchanged.
        self.assertEqual(len(new_ref_genome_seq_record),
                len(ref_genome_seq_record))

        # Assert mutations are there.
        for position in range(10, 111, 10):
            self.assertEqual('G', str(new_ref_genome_seq_record[position - 1]))

        # Assert new genome is annotated.
        self.assertTrue(new_ref_genome.is_annotated())
Example #7
def get_split_reads(sample_alignment):
    """Isolate split reads from a sample alignment.

    This uses a python script supplied with Lumpy that is run as a
    separate process.

    NOTE THAT THIS SCRIPT ONLY WORKS WITH BWA MEM.
    """
    bwa_split_dataset = get_dataset_with_type(
            sample_alignment, Dataset.TYPE.BWA_SPLIT)
    if bwa_split_dataset is not None:
        if (bwa_split_dataset.status == Dataset.STATUS.READY and
                os.path.exists(bwa_split_dataset.get_absolute_location())):
            return bwa_split_dataset
    else:
        bwa_split_dataset = Dataset.objects.create(
                label=Dataset.TYPE.BWA_SPLIT,
                type=Dataset.TYPE.BWA_SPLIT,
                status=Dataset.STATUS.NOT_STARTED)
        sample_alignment.dataset_set.add(bwa_split_dataset)

    # If here, we are going to run or re-run the Dataset.
    bwa_split_dataset.status = Dataset.STATUS.NOT_STARTED
    bwa_split_dataset.save(update_fields=['status'])

    bam_dataset = get_dataset_with_type(sample_alignment, Dataset.TYPE.BWA_ALIGN)
    bam_filename = bam_dataset.get_absolute_location()

    assert os.path.exists(bam_filename), "BAM file '%s' is missing." % (
            bam_filename)

    bam_split_filename = os.path.join(sample_alignment.get_model_data_dir(),
            'bwa_split_reads.bam')

    try:
        bwa_split_dataset.status = Dataset.STATUS.COMPUTING
        bwa_split_dataset.save(update_fields=['status'])
        extract_split_reads(bam_filename, bam_split_filename)

    except subprocess.CalledProcessError:
        # if there are no split reads, then fail.
        bwa_split_dataset.filesystem_location = ''
        bwa_split_dataset.status = Dataset.STATUS.FAILED
    finally:
        bwa_split_dataset.status = Dataset.STATUS.READY
        bwa_split_dataset.filesystem_location = clean_filesystem_location(
                bam_split_filename)


    bwa_split_dataset.save()

    return bwa_split_dataset
Example #8
    def test_basic(self):
        """Basic test.
        """
        self.reference_genome = import_reference_genome_from_local_file(
            self.project, 'ref_genome', TEST_GENBANK, 'genbank')
        variant_set = VariantSet.objects.create(
            reference_genome=self.reference_genome, label='vs1')

        ref_genome_filepath = get_dataset_with_type(
            self.reference_genome,
            Dataset.TYPE.REFERENCE_GENOME_GENBANK).get_absolute_location()

        with open(ref_genome_filepath) as fh:
            ref_genome_seq_record = SeqIO.read(fh, 'genbank')

        for position in range(10, 111, 10):
            ref_value = ref_genome_seq_record[position - 1]
            var = Variant.objects.create(
                type=Variant.TYPE.TRANSITION,
                reference_genome=self.reference_genome,
                chromosome=Chromosome.objects.get(
                    reference_genome=self.reference_genome),
                position=position,
                ref_value=ref_value)

            VariantAlternate.objects.create(variant=var, alt_value='G')

            VariantToVariantSet.objects.create(variant=var,
                                               variant_set=variant_set)

        new_ref_genome_params = {'label': 'new'}

        new_ref_genome = generate_new_reference_genome(variant_set,
                                                       new_ref_genome_params)

        new_ref_genome_filepath = get_dataset_with_type(
                        new_ref_genome,
                        Dataset.TYPE.REFERENCE_GENOME_GENBANK)\
                .get_absolute_location()
        with open(new_ref_genome_filepath) as fh:
            new_ref_genome_seq_record = SeqIO.read(fh, 'genbank')

        # Assert size unchanged.
        self.assertEqual(len(new_ref_genome_seq_record),
                         len(ref_genome_seq_record))

        # Assert mutations are there.
        for position in range(10, 111, 10):
            self.assertEqual('G', str(new_ref_genome_seq_record[position - 1]))

        # Assert new genome is annotated.
        self.assertTrue(new_ref_genome.is_annotated())
Example #9
def get_split_reads(sample_alignment):
    """Isolate split reads from a sample alignment.

    This uses a python script supplied with Lumpy that is run as a
    separate process.

    NOTE THAT THIS SCRIPT ONLY WORKS WITH BWA MEM.
    """
    bwa_split_dataset = get_dataset_with_type(sample_alignment,
                                              Dataset.TYPE.BWA_SPLIT)
    if bwa_split_dataset is not None:
        if (bwa_split_dataset.status == Dataset.STATUS.READY
                and os.path.exists(bwa_split_dataset.get_absolute_location())):
            return bwa_split_dataset
    else:
        bwa_split_dataset = Dataset.objects.create(
            label=Dataset.TYPE.BWA_SPLIT,
            type=Dataset.TYPE.BWA_SPLIT,
            status=Dataset.STATUS.NOT_STARTED)
        sample_alignment.dataset_set.add(bwa_split_dataset)

    # If here, we are going to run or re-run the Dataset.
    bwa_split_dataset.status = Dataset.STATUS.NOT_STARTED
    bwa_split_dataset.save(update_fields=['status'])

    bam_dataset = get_dataset_with_type(sample_alignment,
                                        Dataset.TYPE.BWA_ALIGN)
    bam_filename = bam_dataset.get_absolute_location()

    assert os.path.exists(
        bam_filename), "BAM file '%s' is missing." % (bam_filename)

    bam_split_filename = os.path.join(sample_alignment.get_model_data_dir(),
                                      'bwa_split_reads.bam')

    try:
        bwa_split_dataset.status = Dataset.STATUS.COMPUTING
        bwa_split_dataset.save(update_fields=['status'])
        extract_split_reads(bam_filename, bam_split_filename)

    except subprocess.CalledProcessError:
        # if there are no split reads, then fail.
        bwa_split_dataset.filesystem_location = ''
        bwa_split_dataset.status = Dataset.STATUS.FAILED
    finally:
        bwa_split_dataset.status = Dataset.STATUS.READY
        bwa_split_dataset.filesystem_location = clean_filesystem_location(
            bam_split_filename)

    bwa_split_dataset.save()

    return bwa_split_dataset
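
extract_split_reads() is the project's wrapper around the split-read extraction step. A minimal sketch of that step, assuming samtools and the extractSplitReads_BwaMem script shipped with Lumpy are both on PATH (the pipeline follows the lumpy-sv README), could look like:

import subprocess

def extract_split_reads_sketch(bam_filename, bam_split_filename):
    """Pulls BWA-MEM split alignments out of a BAM file into a new BAM.

    Assumption: 'samtools' and Lumpy's 'extractSplitReads_BwaMem' are on
    PATH; this only illustrates the step the project helper wraps.
    """
    cmd = (
        'samtools view -h {bam} '
        '| extractSplitReads_BwaMem -i stdin '
        '| samtools view -Sb - > {out}'
    ).format(bam=bam_filename, out=bam_split_filename)
    subprocess.check_call(cmd, shell=True)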
Example #10
def get_discordant_read_pairs(sample_alignment):
    """Isolate discordant pairs of reads from a sample alignment.
    """
    # First, check if completed dataset already exists.
    bwa_disc_dataset = get_dataset_with_type(sample_alignment,
                                             Dataset.TYPE.BWA_DISCORDANT)
    if bwa_disc_dataset is not None:
        if (bwa_disc_dataset.status == Dataset.STATUS.READY
                and os.path.exists(bwa_disc_dataset.get_absolute_location())):
            return bwa_disc_dataset
    else:
        bwa_disc_dataset = Dataset.objects.create(
            label=Dataset.TYPE.BWA_DISCORDANT,
            type=Dataset.TYPE.BWA_DISCORDANT)
        sample_alignment.dataset_set.add(bwa_disc_dataset)

    # If here, we are going to run or re-run the Dataset.
    bwa_disc_dataset.status = Dataset.STATUS.NOT_STARTED
    bwa_disc_dataset.save(update_fields=['status'])

    bam_dataset = get_dataset_with_type(sample_alignment,
                                        Dataset.TYPE.BWA_ALIGN)
    bam_filename = bam_dataset.get_absolute_location()

    assert os.path.exists(
        bam_filename), "BAM file '%s' is missing." % (bam_filename)

    # NOTE: This assumes the index path is just the BAM path with '.bai'
    # appended - will this always be true?
    if not os.path.exists(bam_filename + '.bai'):
        index_bam_file(bam_filename)

    bam_discordant_filename = os.path.join(
        sample_alignment.get_model_data_dir(), 'bwa_discordant_pairs.bam')

    try:
        bwa_disc_dataset.status = Dataset.STATUS.COMPUTING
        bwa_disc_dataset.save(update_fields=['status'])
        extract_discordant_read_pairs(bam_filename, bam_discordant_filename)

    except subprocess.CalledProcessError:
        bwa_disc_dataset.filesystem_location = ''
        bwa_disc_dataset.status = Dataset.STATUS.FAILED
    finally:
        bwa_disc_dataset.status = Dataset.STATUS.READY
        bwa_disc_dataset.filesystem_location = clean_filesystem_location(
            bam_discordant_filename)

    bwa_disc_dataset.save()

    return bwa_disc_dataset
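
Similarly, extract_discordant_read_pairs() presumably boils down to a samtools flag filter. A minimal sketch, assuming samtools is on PATH and following the lumpy-sv README convention of excluding flag 1294:

import subprocess

def extract_discordant_pairs_sketch(bam_filename, bam_discordant_filename):
    """Writes discordantly mapped read pairs to a new BAM file.

    Assumption: 'samtools' is on PATH. The -F 1294 filter excludes proper
    pairs, unmapped reads/mates, secondary alignments and duplicates.
    """
    with open(bam_discordant_filename, 'w') as fh:
        subprocess.check_call(
                ['samtools', 'view', '-b', '-F', '1294', bam_filename],
                stdout=fh)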
Example #11
def add_genbank_file_track(reference_genome, **kwargs):
    """
    Jbrowse has the ability to make tracks out of genbank files. This
    takes the genbank file from a reference_genome object and attempts to
    make such a track and then add it to the track list.
    """
    FLATFILE_TRACK_BIN = os.path.join(JBROWSE_BIN_PATH, 'flatfile-to-json.pl')

    reference_gbk = get_dataset_with_type(
            reference_genome,
            type=Dataset.TYPE.REFERENCE_GENOME_GENBANK).get_absolute_location()

    jbrowse_path = reference_genome.get_jbrowse_directory_path()

    reference_gff = get_dataset_with_type(
            reference_genome,
            type=Dataset.TYPE.REFERENCE_GENOME_GFF).get_absolute_location()

    json_update_fields = {
        'style': {
            'label': 'name,CDS,gene',
            'description': 'note,function,gene_synonym',
            'color': '#5fbcdd'
        }
    }

    genbank_json_command = [
        FLATFILE_TRACK_BIN,
        '--gff', reference_gff,
        '--out', os.path.join(jbrowse_path,'indiv_tracks','gbk'),
        '--type', JBROWSE_GBK_TYPES_TO_DISPLAY,
        '--autocomplete','all',
        '--trackLabel','gbk',
        '--key',"Genome Features",
        '--trackType',"CanvasFeatures",
        #'--getSubfeatures',
        #'--className','transcript',
        #'--subfeatureClasses', "{\"CDS\":\"transcript-CDS\"}"
    ]

    subprocess.check_call(genbank_json_command)

    # Finally, manually update tracklist json with style info
    tracklist_json = get_tracklist_json(reference_genome, 'gbk')

    for i, track in enumerate(tracklist_json['tracks']):
        if track['key'] == 'Genome Features':
            tracklist_json['tracks'][i] = merge_nested_dictionaries(
                    track, json_update_fields)

    write_tracklist_json(reference_genome, tracklist_json, 'gbk')
Example #12
def get_discordant_read_pairs(sample_alignment):
    """Isolate discordant pairs of reads from a sample alignment.
    """
    # First, check if completed dataset already exists.
    bwa_disc_dataset = get_dataset_with_type(
            sample_alignment, Dataset.TYPE.BWA_DISCORDANT)
    if bwa_disc_dataset is not None:
        if (bwa_disc_dataset.status == Dataset.STATUS.READY and
                os.path.exists(bwa_disc_dataset.get_absolute_location())):
            return bwa_disc_dataset
    else:
        bwa_disc_dataset = Dataset.objects.create(
                label=Dataset.TYPE.BWA_DISCORDANT,
                type=Dataset.TYPE.BWA_DISCORDANT)
        sample_alignment.dataset_set.add(bwa_disc_dataset)

    # If here, we are going to run or re-run the Dataset.
    bwa_disc_dataset.status = Dataset.STATUS.NOT_STARTED
    bwa_disc_dataset.save(update_fields=['status'])

    bam_dataset = get_dataset_with_type(sample_alignment, Dataset.TYPE.BWA_ALIGN)
    bam_filename = bam_dataset.get_absolute_location()

    assert os.path.exists(bam_filename), "BAM file '%s' is missing." % (
            bam_filename)

    # NOTE: This assumes the index path is just the BAM path with '.bai'
    # appended - will this always be true?
    if not os.path.exists(bam_filename+'.bai'):
        index_bam_file(bam_filename)

    bam_discordant_filename = os.path.join(sample_alignment.get_model_data_dir(),
            'bwa_discordant_pairs.bam')

    try:
        bwa_disc_dataset.status = Dataset.STATUS.COMPUTING
        bwa_disc_dataset.save(update_fields=['status'])
        extract_discordant_read_pairs(bam_filename, bam_discordant_filename)

    except subprocess.CalledProcessError:
        bwa_disc_dataset.filesystem_location = ''
        bwa_disc_dataset.status = Dataset.STATUS.FAILED
    finally:
        bwa_disc_dataset.status = Dataset.STATUS.READY
        bwa_disc_dataset.filesystem_location = clean_filesystem_location(
                bam_discordant_filename)

    bwa_disc_dataset.save()

    return bwa_disc_dataset
Example #13
    def _fastqc_test_runner(self, fastq1_location, fastq2_location):
        """Helper that takes different fastqs as source.

        This function is a test itself.
        """
        # Run FastQC
        gz_backed_sample = self.common_entities['sample_1']
        gz_fastq1_dataset = copy_and_add_dataset_source(
                gz_backed_sample, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
                fastq1_location)
        gz_fastq2_dataset = copy_and_add_dataset_source(
                gz_backed_sample, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ2,
                fastq2_location)
        run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq1_dataset)
        run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq2_dataset,
                rev=True)

        # We expect 2 Datasets per Fastq, so 4 total.
        self.assertEqual(4, Dataset.objects.count())

        # Check link matches file extension.
        FASTQC_DATASET_TYPES = [
                Dataset.TYPE.FASTQC1_HTML, Dataset.TYPE.FASTQC2_HTML]
        for fastqc_dataset_type in FASTQC_DATASET_TYPES:
            fastqc_1_dataset = get_dataset_with_type(
                    gz_backed_sample, fastqc_dataset_type)
            assert os.path.exists(fastqc_1_dataset.get_absolute_location())
Example #14
def get_features_at_locations(ref_genome, intervals, chromosome=None):
    """
    Use the genbank index dataset and return gene or mobile element names
    that are within these intervals.
    """
    feature_index_path = get_dataset_with_type(
        ref_genome, Dataset.TYPE.FEATURE_INDEX).get_absolute_location()

    with open(feature_index_path, 'r') as fh:

        gbk_feature_list = pickle.load(fh)

        # Dictionary of features to return, for each interval.
        return_features = {}

        # For each input interval, return a list of feature names that
        # overlap.
        for interval in intervals:
            q_ivl = pyinter.closedopen(*interval)
            features = [
                f_ivl for f_ivl in gbk_feature_list if q_ivl.intersect(f_ivl)
            ]

            return_features[interval] = features

        return return_features
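
The interval test above relies on pyinter. A minimal standalone sketch of the same query pattern (coordinates are made up, and the real index presumably stores feature objects that carry names rather than the bare intervals used here):

import pyinter

# Toy "index": a list of feature intervals; coordinates are illustrative.
feature_intervals = [
    pyinter.closedopen(100, 450),
    pyinter.closedopen(900, 1200),
]

# Same truthiness-based overlap test as in get_features_at_locations().
query = pyinter.closedopen(400, 500)
overlapping = [ivl for ivl in feature_intervals if query.intersect(ivl)]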
Example #15
    def _run_genome_finish_test(self, variant_set, target_fasta,
            mismatch_tolerance=0):

        self.assertTrue(variant_set.variants.exists(),
            'No placeable contigs found.')

        # Make new reference genome
        new_ref_genome_params = {'label': 'new_ref'}
        new_ref_genome = generate_new_reference_genome(
                variant_set, new_ref_genome_params)

        # Verify insertion was placed correctly
        new_ref_genome_fasta = get_dataset_with_type(
                new_ref_genome, Dataset.TYPE.REFERENCE_GENOME_FASTA
                ).get_absolute_location()

        fastas_same, indexes = are_fastas_same(
                target_fasta, new_ref_genome_fasta)

        indexes_str = str(indexes) if len(indexes) < 50 else (
                str(indexes[:50]) + '...')

        self.assertTrue(len(indexes) <= mismatch_tolerance,
                'Fastas dissimilar at indexes:' +
                indexes_str)
Example #16
    def _fastqc_test_runner(self, fastq1_location, fastq2_location):
        """Helper that takes different fastqs as source.

        This function is a test itself.
        """
        # Run FastQC
        gz_backed_sample = self.common_entities['sample_1']
        gz_fastq1_dataset = copy_and_add_dataset_source(
                gz_backed_sample, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
                fastq1_location)
        gz_fastq2_dataset = copy_and_add_dataset_source(
                gz_backed_sample, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ2,
                fastq2_location)
        run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq1_dataset)
        run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq2_dataset,
                rev=True)

        # We expect 2 Datasets per Fastq, so 4 total.
        self.assertEqual(4, Dataset.objects.count())

        # Check link matches file extension.
        FASTQC_DATASET_TYPES = [
                Dataset.TYPE.FASTQC1_HTML, Dataset.TYPE.FASTQC2_HTML]
        for fastqc_dataset_type in FASTQC_DATASET_TYPES:
            fastqc_1_dataset = get_dataset_with_type(
                    gz_backed_sample, fastqc_dataset_type)
            assert os.path.exists(fastqc_1_dataset.get_absolute_location())
Example #17
    def _run_genome_finish_test(self, variant_set, target_fasta,
            mismatch_tolerance=0):

        self.assertTrue(variant_set.variants.exists(),
            'No placeable contigs found.')

        # Make new reference genome
        new_ref_genome_params = {'label': 'new_ref'}
        new_ref_genome = generate_new_reference_genome(
                variant_set, new_ref_genome_params)

        # Verify insertion was placed correctly
        new_ref_genome_fasta = get_dataset_with_type(
                new_ref_genome, Dataset.TYPE.REFERENCE_GENOME_FASTA
                ).get_absolute_location()

        fastas_same, indexes = are_fastas_same(
                target_fasta, new_ref_genome_fasta)

        indexes_str = str(indexes) if len(indexes) < 50 else (
                str(indexes[:50]) + '...')

        self.assertTrue(len(indexes) <= mismatch_tolerance,
                'Fastas dissimilar at indexes:' +
                indexes_str)
Example #18
def get_features_at_locations(ref_genome, intervals, chromosome=None):
    """
    Use the genbank index dataset and return gene or mobile element names
    that are within these intervals.
    """
    feature_index_path = get_dataset_with_type(ref_genome,
            Dataset.TYPE.FEATURE_INDEX).get_absolute_location()

    with open(feature_index_path, 'r') as fh:

        gbk_feature_list = pickle.load(fh)

        # Dictionary of features to return, for each interval.
        return_features = {}

        # For each input interval, return a list of feature names that
        # overlap.
        for interval in intervals:
            q_ivl = pyinter.closedopen(*interval)
            features = [f_ivl for f_ivl in gbk_feature_list if
                    q_ivl.intersect(f_ivl)]

            return_features[interval] = features

        return return_features
Example #19
def _find_valid_sample_alignments(alignment_group, alignment_type):
    """ Returns a list sample alignment objects for an alignment,
        skipping those that failed. """
    sample_alignment_list = (
            alignment_group.experimentsampletoalignment_set.all())

    # Filter out mis-aligned files.
    # TODO: Should we show in the UI that some alignments failed and are
    # being skipped?
    def _is_successful_alignment(sample_alignment):
        bam_dataset = get_dataset_with_type(sample_alignment, alignment_type)
        return bam_dataset.status == Dataset.STATUS.READY
    sample_alignment_list = [sample_alignment for sample_alignment in
            sample_alignment_list if _is_successful_alignment(sample_alignment)]

    if len(sample_alignment_list) == 0:
        raise Exception('No successful alignments, Freebayes cannot proceed.')

    bam_files = [
            get_dataset_with_type(sa, alignment_type).get_absolute_location()
            for sa in sample_alignment_list]

    # Keep only valid bam_files
    valid_bam_files = []
    for bam_file in bam_files:
        if bam_file is None:
            continue
        if not os.stat(bam_file).st_size > 0:
            continue
        valid_bam_files.append(bam_file)
    assert len(valid_bam_files) == len(sample_alignment_list), (
            "Expected %d bam files, but found %d" % (
                    len(sample_alignment_list), len(valid_bam_files)))
    return sample_alignment_list
Example #20
def _vcf_to_vcftabix(vcf_dataset):
    """Compresses and indexes a vcf using samtools tabix.

    Creates a new Dataset model instance for this compressed version, with the
    same related objects (e.g. pointing to the same AlignmentGroup). The
    Dataset is flagged as compressed, indexed, etc.

    Args:
        vcf_dataset: Dataset pointing to a vcf, or its compressed version.
            Index may or may not exist.

    Returns:
        Dataset that points to compressed version of input vcf_dataset, if it
        wasn't compressed already. The index file is asserted to exist for this
        compressed Dataset.
    """
    ### This function has two steps:
    ###     1. Get or create compressed Dataset.
    ###     2. Create index if it doesn't exist.

    ### 1. Get or create compressed Dataset.
    if vcf_dataset.is_compressed():
        compressed_dataset = vcf_dataset
    else:
        # Check for existing compressed version using related model.
        # Assume that the first model will do.
        related_model = vcf_dataset.get_related_model_set().all()[0]
        compressed_dataset = get_dataset_with_type(
                entity=related_model,
                type=vcf_dataset.type,
                compressed=True)
        # If there is no compressed dataset, then make it
        if compressed_dataset is None:
            compressed_dataset = vcf_dataset.make_compressed('.bgz')

    ### 2. Create index if it doesn't exist.
    if compressed_dataset.filesystem_idx_location == '':

        # Set the tabix index location
        compressed_dataset.filesystem_idx_location = (
                compressed_dataset.filesystem_location + '.tbi')
        compressed_dataset.save()

        # Make tabix index
        subprocess.check_call([
            TABIX_BINARY, '-f',
            '-p', 'vcf',
            compressed_dataset.get_absolute_location()
        ])

    # Make sure the index exists, whether created now or previously.
    assert compressed_dataset.filesystem_idx_location == (
            compressed_dataset.filesystem_location + '.tbi'), (
            'Tabix index file location is not correct.')
    assert os.path.exists(
            compressed_dataset.get_absolute_idx_location()), (
            'Tabix index file does not exist on filesystem.')

    return compressed_dataset
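
Stripped of the Dataset bookkeeping, the compress-and-index step above comes down to bgzip plus tabix. A minimal sketch, assuming both binaries are on PATH:

import subprocess

def bgzip_and_tabix_sketch(vcf_path):
    """Writes <vcf_path>.bgz and <vcf_path>.bgz.tbi, returning the .bgz path.

    Assumption: 'bgzip' and 'tabix' are on PATH. Tabix requires bgzip
    (block gzip) compression rather than plain gzip.
    """
    bgz_path = vcf_path + '.bgz'

    # Write a block-gzipped copy of the vcf.
    with open(bgz_path, 'w') as fh:
        subprocess.check_call(['bgzip', '-c', vcf_path], stdout=fh)

    # Build the index; this creates bgz_path + '.tbi' alongside the file.
    subprocess.check_call(['tabix', '-f', '-p', 'vcf', bgz_path])

    return bgz_path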
Example #21
def add_vcf_track(reference_genome, alignment_group, vcf_dataset_type):
    """DEPRECATED. Use add_vcf_track_given_dataset().
    """
    # Get the vcf file location from the Dataset of the genome
    # keyed by the alignment_type.
    vcf_dataset = get_dataset_with_type(alignment_group, vcf_dataset_type)
    return add_vcf_track_given_dataset(reference_genome, alignment_group,
                                       vcf_dataset)
Example #22
def prepare_ref_genome_related_datasets(ref_genome, dataset):
    """Prepares data related to a ReferenceGenome.

    For example, if only Genbank exists, creates a Fasta Dataset.

    If related Datasets exists, this function is a no-op.

    Args:
        ref_genome: ReferenceGenome.
        dataset: A dataset pointing to a genome.

    Raises:
        AssertionError if dataset status is NOT_STARTED.
    """
    assert dataset.status != Dataset.STATUS.NOT_STARTED

    if dataset.type == Dataset.TYPE.REFERENCE_GENOME_FASTA:

        # make sure the fasta index is generated

        # Run jbrowse ref genome processing
        prepare_jbrowse_ref_sequence(ref_genome)

    elif dataset.type == Dataset.TYPE.REFERENCE_GENOME_GENBANK:
        # Run snpeff build after creating ReferenceGenome obj.
        build_snpeff(ref_genome)

        # These functions are NO-OPS if the respective Datasets exist.
        generate_fasta_from_genbank(ref_genome)
        generate_gff_from_genbank(ref_genome)

        # Run jbrowse genbank genome processing for genes
        add_genbank_file_track(ref_genome)

        # Create an indexed set of intervals so we can find contigs
        # and snps within genes without using snpEFF.

        feature_index_output_path = os.path.join(
                ref_genome.get_snpeff_genbank_parent_dir(),
                'gbk_feature_idx.pickle')

        generate_gbk_feature_index(
                ref_genome.get_snpeff_genbank_file_path(),
                feature_index_output_path)

        gbk_idx_dataset = Dataset.objects.create(
                label=Dataset.TYPE.FEATURE_INDEX,
                type=Dataset.TYPE.FEATURE_INDEX)

        gbk_idx_dataset.filesystem_location = feature_index_output_path
        gbk_idx_dataset.save()
        ref_genome.dataset_set.add(gbk_idx_dataset)

    # We create the bwa index once here, so that alignments running in
    # parallel don't step on each others' toes.
    ref_genome_fasta = get_dataset_with_type(ref_genome,
            Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()
    ensure_bwa_index(ref_genome_fasta)
Example #23
def add_vcf_track(reference_genome, alignment_group, vcf_dataset_type):
    """DEPRECATED. Use add_vcf_track_given_dataset().
    """
    # Get the vcf file location from the Dataset of the genome
    # keyed by the alignment_type.
    vcf_dataset = get_dataset_with_type(alignment_group,
            vcf_dataset_type)
    return add_vcf_track_given_dataset(
        reference_genome, alignment_group, vcf_dataset)
Example #24
def compute_callable_loci(reference_genome, sample_alignment,
            bam_file_location, stderr=None):
    """Computes callable loci for the alignment's BAM file and registers the
    resulting BED as a Dataset and a JBrowse track.
    """

    # Set output fn to None in case try fails.
    callable_loci_bed_fn = None

    try:
        ref_genome_fasta_location = get_dataset_with_type(
                reference_genome,
                Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

        output = _get_callable_loci_output_filename(bam_file_location)

        get_callable_loci(bam_file_location, output)

        # Add callable loci bed as dataset
        callable_loci_bed = Dataset.objects.create(
                label=Dataset.TYPE.BED_CALLABLE_LOCI,
                type=Dataset.TYPE.BED_CALLABLE_LOCI,
                filesystem_location=clean_filesystem_location(output))

        sample_alignment.dataset_set.add(callable_loci_bed)
        sample_alignment.save()

        callable_loci_bed_fn = callable_loci_bed.get_absolute_location()

        output = subprocess.check_output(
                ['cat', callable_loci_bed_fn])

        with open(callable_loci_bed_fn, 'w') as callable_loci_bed_fh:
            for i, line in enumerate(output.split('\n')):
                try:
                    fields = line.split()
                    if len(fields) == 0:
                        continue
                    chrom, start, end, feature = fields

                    feature = titlecase_spaces(feature)
                    # Bed feature can't have spaces =(
                    feature = feature.replace(' ', '_')

                    print >> callable_loci_bed_fh, '\t'.join(
                            [chrom, start, end, feature])
                except Exception as e:
                    print >> stderr, (
                        'WARNING: Callable Loci line ' +
                        '%d: (%s) couldn\'t be parsed: %s') % (
                                i, line, str(e))

        # add it as a jbrowse track
        add_bed_file_track(reference_genome, sample_alignment, callable_loci_bed)

    except Exception as e:
        print >> stderr, 'WARNING: Callable Loci failed.'
        print >> stderr, str(e)

    return callable_loci_bed_fn
Example #25
    def _compute():
        """Calls compute function then recursively calls
        get_insert_size_mean_and_stdev().
        """
        bam_file = get_dataset_with_type(sample_alignment,
                Dataset.TYPE.BWA_ALIGN).get_absolute_location()
        compute_insert_metrics(bam_file, sample_alignment, stderr=stderr)
        return get_insert_size_mean_and_stdev(sample_alignment, stderr,
                _iteration=_iteration + 1)
Example #26
    def _compute():
        """Calls compute function then recursively calls
        get_insert_size_mean_and_stdev().
        """
        bam_file = get_dataset_with_type(
            sample_alignment, Dataset.TYPE.BWA_ALIGN).get_absolute_location()
        compute_insert_metrics(bam_file, sample_alignment, stderr=stderr)
        return get_insert_size_mean_and_stdev(sample_alignment,
                                              stderr,
                                              _iteration=_iteration + 1)
Example #27
def run_pindel(fasta_ref, sample_alignments, vcf_output_dir,
        vcf_output_filename, alignment_type, **kwargs):
    """Run pindel to find SVs."""
    if not os.path.isdir('%s/pindel' % settings.TOOLS_DIR):
        raise Exception('Pindel is not installed. Aborting.')

    bam_files = [
            get_dataset_with_type(sa, alignment_type).get_absolute_location()
            for sa in sample_alignments]

    samples = [sa.experiment_sample for sa in sample_alignments]
    insert_sizes = [get_insert_size_mean_and_stdev(sa) for sa in
            sample_alignments]

    assert len(bam_files) == len(insert_sizes)

    # Create pindel config file
    pindel_config = os.path.join(vcf_output_dir, 'pindel_config.txt')
    at_least_one_config_line_written = False
    with open(pindel_config, 'w') as fh:
        for bam_file, sample, insert_size in zip(
                bam_files, samples, insert_sizes):

            # Skip bad alignments.
            mean, stdev = insert_size
            if mean == -1:
                continue
            fh.write('%s %s %s\n' % (bam_file, mean, sample.uid))
            at_least_one_config_line_written = True

    if not at_least_one_config_line_written:
        # No valid alignments were written to the config; pindel cannot run.
        raise Exception('No pindel config lines written.')

    # Build the full pindel command.
    pindel_root = vcf_output_filename[:-4]  # get rid of .vcf extension
    subprocess.check_call(['%s/pindel/pindel' % settings.TOOLS_DIR,
        '-f', fasta_ref,
        '-i', pindel_config,
        '-c', 'ALL',
        '-o', pindel_root
    ])

    # convert all different structural variant types to vcf
    subprocess.check_call(['%s/pindel/pindel2vcf' % settings.TOOLS_DIR,
        '-P', pindel_root,
        '-r', fasta_ref,
        '-R', 'name',
        '-d', 'date',
        '-mc', '1',  # just need one read to show 1/1 in vcf
    ])

    postprocess_pindel_vcf(vcf_output_filename)

    return True # success
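
For reference, the pindel_config.txt written above contains one whitespace-separated line per usable sample: BAM path, mean insert size, then the sample uid. The values below are invented purely for illustration of that format.

# Illustrative pindel config contents (paths, insert sizes and uids are
# made up); each line matches the '%s %s %s\n' format written above.
EXAMPLE_PINDEL_CONFIG_LINES = [
    '/path/to/sample_a/bwa_align.bam 250 sampleA_uid',
    '/path/to/sample_b/bwa_align.bam 280 sampleB_uid',
]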
Example #28
def run_freebayes(fasta_ref, sample_alignments, vcf_output_dir,
        vcf_output_filename, alignment_type, region=None, **kwargs):
    """Run freebayes using the bam alignment files keyed by the alignment_type
    for all Genomes of the passed in ReferenceGenome.

    NOTE: If a Genome doesn't have a bam alignment file with this
    alignment_type, then it won't be used.

    Returns:
        Boolean, True if successfully made it to the end, else False.
    """
    bam_files = [
            get_dataset_with_type(sa, alignment_type).get_absolute_location()
            for sa in sample_alignments]

    # Build up the bam part of the freebayes binary call.
    bam_part = []
    for bam_file in bam_files:
        bam_part.append('--bam')
        bam_part.append(bam_file)

    # Determine alignment ploidy (haploid or diploid).
    alignment_group = sample_alignments[0].alignment_group
    if alignment_group.alignment_options['call_as_haploid']:
        alignment_ploidy = 1
    else:
        alignment_ploidy = 2

    other_args_part = [
        '--fasta-reference', fasta_ref,
        '--pvar', '0.001',
        '--ploidy', str(alignment_ploidy),
        '--min-alternate-fraction', '.3',
        '--hwe-priors-off',
        # '--binomial-obs-priors-off',
        '--use-mapping-quality',
        '--min-base-quality', '25',
        '--min-mapping-quality', '30'
    ]

    if region:
        other_args_part.extend(['--region',region])

    # Build the full command and execute it for all bam files at once.
    full_command = (
            ['%s/freebayes/freebayes' % settings.TOOLS_DIR] +
            bam_part +
            other_args_part)

    with open(vcf_output_filename, 'w') as fh:
        subprocess.check_call(full_command, stdout=fh)

    return True # success
Example #29
    def test_generate_genbank_mobile_element_multifasta(self):
        """Test generation of the mobile element fasta.
        """
        self.reference_genome = import_reference_genome_from_local_file(
                self.project, 'ref_genome', TEST_GENBANK, 'genbank')
        self.reference_genome.ensure_mobile_element_multifasta()

        me_fa_dataset = get_dataset_with_type(
                self.reference_genome,
                Dataset.TYPE.MOBILE_ELEMENT_FASTA)

        assert os.path.exists(
                me_fa_dataset.get_absolute_location())
Example #30
def run_lumpy(fasta_ref, sample_alignments, vcf_output_dir,
              vcf_output_filename, alignment_type, **kwargs):
    """Runs lumpy.
    """
    print 'RUNNING LUMPY...'

    # NOTE: Only supporting single sample alignment for now. Previously we
    # tried to use lumpy for multiple sample alignments but the machine would
    # run out of memory so we are going to limit functionality to single
    # alignment only for now.
    assert len(sample_alignments) == 1

    # Get relevant files. Note this is written to handle more than 1 sample
    # although right now we are not running lumpy on more than one sample at
    # a time as enforced by the assert above.
    bam_file_list = []
    bam_disc_file_list = []
    bam_sr_file_list = []
    for sa in sample_alignments:
        bam_dataset = get_dataset_with_type(sa, Dataset.TYPE.BWA_ALIGN)
        bam_file_list.append(bam_dataset.get_absolute_location())

        # Get or create discordant reads.
        bam_disc_dataset = get_discordant_read_pairs(sa)
        bam_disc_file_list.append(bam_disc_dataset.get_absolute_location())

        # Get or create split reads.
        bam_sr_dataset = get_split_reads(sa)
        bam_sr_file_list.append(bam_sr_dataset.get_absolute_location())

    lumpy_cmd = [
        settings.LUMPY_EXPRESS_BINARY,
        '-B',
        ','.join(bam_file_list),
        '-S',
        ','.join(bam_sr_file_list),
        '-D',
        ','.join(bam_disc_file_list),
        '-o',
        vcf_output_filename,
        '-P'  # get probability distributions, required for merge
    ]

    print ' '.join(lumpy_cmd)

    # Run Lumpy Express.
    lumpy_error_output = vcf_output_filename + '.error'
    with open(lumpy_error_output, 'w') as error_output_fh:
        subprocess.check_call(lumpy_cmd, stderr=error_output_fh)

    return True  # success
Example #31
def get_insert_size_mean_and_stdev(sample_alignment,
                                   stderr=None,
                                   _iteration=0):
    """Returns a tuple (mean, stdev) for insert sizes from the alignment.

    Calls the compute function if metrics don't exist.

    If the insert size can't be calculated, perhaps because of a bad alignment,
    returns (-1, -1).

    Args:
        sample_alignment: ExperimentSampleToAlignment we want metrics for.
        _iteration: Used internally to avoid getting stuck in case where
            computation repeatedly fails.

    Returns:
        Tuple of ints (mean, stdev).
    """
    # Prevent getting stuck in case computation keeps failing.
    if _iteration >= 3:
        return (-1, -1)

    def _compute():
        """Calls compute function then recursively calls
        get_insert_size_mean_and_stdev().
        """
        bam_file = get_dataset_with_type(
            sample_alignment, Dataset.TYPE.BWA_ALIGN).get_absolute_location()
        compute_insert_metrics(bam_file, sample_alignment, stderr=stderr)
        return get_insert_size_mean_and_stdev(sample_alignment,
                                              stderr,
                                              _iteration=_iteration + 1)

    mean_stdev_dataset = get_dataset_with_type(
        sample_alignment, Dataset.TYPE.LUMPY_INSERT_METRICS_MEAN_STDEV)
    if not mean_stdev_dataset:
        return _compute()

    file_location = mean_stdev_dataset.get_absolute_location()
    if not os.path.exists(file_location):
        return _compute()

    with open(file_location) as fh:
        combined_str = fh.read().strip()
        parts = combined_str.split(',')
        if not len(parts) == 2:
            return _compute()
        else:
            return tuple([int(p) for p in parts])
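
The parsing at the end implies that the LUMPY_INSERT_METRICS_MEAN_STDEV dataset is a tiny text file of the form '<mean>,<stdev>'. A minimal sketch of writing and reading that format (the path argument is illustrative):

def write_insert_metrics_sketch(path, mean, stdev):
    """Writes the '<mean>,<stdev>' format parsed above."""
    with open(path, 'w') as fh:
        fh.write('%d,%d' % (mean, stdev))

def read_insert_metrics_sketch(path):
    """Reads back a (mean, stdev) tuple of ints, mirroring the parsing above."""
    with open(path) as fh:
        mean_str, stdev_str = fh.read().strip().split(',')
    return (int(mean_str), int(stdev_str))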
Example #32
def flag_variants_from_bed(alignment_group, bed_dataset_type):
    """For each sample alignment in the alignment group, flags variants that
    fall within regions of its callable-loci BED dataset.
    """
    sample_alignments = alignment_group.experimentsampletoalignment_set.all()
    for sample_alignment in sample_alignments:

        # If there is no callable_loci bed, skip the sample alignment.
        # TODO: Make this extensible to other BED files we might have
        callable_loci_bed = get_dataset_with_type(
            entity=sample_alignment, type=Dataset.TYPE.BED_CALLABLE_LOCI)

        if not callable_loci_bed:
            continue

        # need to add sample_alignment and bed_dataset here.
        add_variants_to_set_from_bed(sample_alignment=sample_alignment,
                                     bed_dataset=callable_loci_bed)
Example #33
def run_lumpy(
        fasta_ref, sample_alignments, vcf_output_dir, vcf_output_filename,
        alignment_type, **kwargs):
    """Runs lumpy.
    """
    print 'RUNNING LUMPY...'

    # NOTE: Only supporting single sample alignment for now. Previously we
    # tried to use lumpy for multiple sample alignments but the machine would
    # run out of memory so we are going to limit functionality to single
    # alignment only for now.
    assert len(sample_alignments) == 1

    # Get relevant files. Note this is written to handle more than 1 sample
    # although right now we are not running lumpy on more than one sample at
    # a time as enforced by the assert above.
    bam_file_list = []
    bam_disc_file_list = []
    bam_sr_file_list = []
    for sa in sample_alignments:
        bam_dataset = get_dataset_with_type(sa, Dataset.TYPE.BWA_ALIGN)
        bam_file_list.append(bam_dataset.get_absolute_location())

        # Get or create discordant reads.
        bam_disc_dataset = get_discordant_read_pairs(sa)
        bam_disc_file_list.append(bam_disc_dataset.get_absolute_location())

        # Get or create split reads.
        bam_sr_dataset = get_split_reads(sa)
        bam_sr_file_list.append(bam_sr_dataset.get_absolute_location())

    lumpy_cmd = [
        settings.LUMPY_EXPRESS_BINARY,
        '-B', ','.join(bam_file_list),
        '-S', ','.join(bam_sr_file_list),
        '-D', ','.join(bam_disc_file_list),
        '-o', vcf_output_filename,
        '-P' # get probability distributions, required for merge
    ]

    print ' '.join(lumpy_cmd)

    # Run Lumpy Express.
    lumpy_error_output = vcf_output_filename + '.error'
    with open(lumpy_error_output, 'w') as error_output_fh:
        subprocess.check_call(lumpy_cmd, stderr=error_output_fh)

    return True  # success
Example #34
def main():
    if not os.path.exists(OUTPUT_DIR):
        os.mkdir(OUTPUT_DIR)

    for sa in ExperimentSampleToAlignment.objects.all():
        histo_dataset = get_dataset_with_type(sa,
                Dataset.TYPE.LUMPY_INSERT_METRICS_HISTOGRAM)
        histo_dataset_full_path = histo_dataset.get_absolute_location()

        # Update Dataset name.
        histo_dataset_name = (
                os.path.split(os.path.split(histo_dataset_full_path)[0])[1] + '.txt')

        # Copy.
        new_full_path = os.path.join(OUTPUT_DIR, histo_dataset_name)
        shutil.copyfile(histo_dataset_full_path, new_full_path)
Example #35
def prepare_jbrowse_ref_sequence(reference_genome, **kwargs):
    """Prepare the reference sequence and place it in the ref_genome dir.

    This implicitly creates the config directory structure for this reference
    genome. Tracks added in the future are added relative to this reference
    genome.

    The implementation of this method is a light wrapper around
    jbrowse/bin/prepare-refseqs.pl.
    """
    PREPARE_REFSEQS_BIN = os.path.join(JBROWSE_BIN_PATH, 'prepare-refseqs.pl')

    # First ensure that the reference genome exists. If it fails, try to
    # convert from genbank, then give up.
    reference_fasta = get_dataset_with_type(
        reference_genome,
        type=Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

    # Next, ensure that the jbrowse directory exists.
    reference_genome.ensure_jbrowse_dir()
    jbrowse_path = os.path.join(reference_genome.get_jbrowse_directory_path(),
                                'indiv_tracks', 'DNA')

    # Now run prepare-refseqs.pl to get the ReferenceGenome in.
    subprocess.check_call([
        PREPARE_REFSEQS_BIN,
        '--fasta',
        reference_fasta,
        '--out',
        jbrowse_path,
    ])

    json_tracks = get_tracklist_json(reference_genome, 'DNA')

    # DNA track should be the first track
    dna_track = json_tracks['tracks'][0]
    assert dna_track['type'] == 'SequenceTrack'

    # Get rid of translation and reverse strand
    dna_track.update({
        "showForwardStrand": True,
        "showReverseStrand": False,
        "showTranslation": False
    })

    write_tracklist_json(reference_genome, json_tracks, 'DNA')
Example #36
def get_insert_size_mean_and_stdev(sample_alignment, stderr=None, _iteration=0):
    """Returns a tuple (mean, stdev) for insert sizes from the alignment.

    Calls the compute function if metrics don't exist.

    If the insert size can't be calculated, perhaps because of a bad alignment,
    returns (-1, -1).

    Args:
        sample_alignment: ExperimentSampleToAlignment we want metrics for.
        _iteration: Used internally to avoid getting stuck in case where
            computation repeatedly fails.

    Returns:
        Tuple of ints (mean, stdev).
    """
    # Prevent getting stuck in case computation keeps failing.
    if _iteration >= 3:
        return (-1, -1)

    def _compute():
        """Calls compute function then recursively calls
        get_insert_size_mean_and_stdev().
        """
        bam_file = get_dataset_with_type(sample_alignment,
                Dataset.TYPE.BWA_ALIGN).get_absolute_location()
        compute_insert_metrics(bam_file, sample_alignment, stderr=stderr)
        return get_insert_size_mean_and_stdev(sample_alignment, stderr,
                _iteration=_iteration + 1)

    mean_stdev_dataset = get_dataset_with_type(sample_alignment,
            Dataset.TYPE.LUMPY_INSERT_METRICS_MEAN_STDEV)
    if not mean_stdev_dataset:
        return _compute()

    file_location = mean_stdev_dataset.get_absolute_location()
    if not os.path.exists(file_location):
        return _compute()

    with open(file_location) as fh:
        combined_str = fh.read().strip()
        parts = combined_str.split(',')
        if not len(parts) == 2:
            return _compute()
        else:
            return tuple([int(p) for p in parts])
Example #37
def flag_variants_from_bed(alignment_group, bed_dataset_type):
    """For each sample alignment in the alignment group, flags variants that
    fall within regions of its callable-loci BED dataset.
    """
    sample_alignments = alignment_group.experimentsampletoalignment_set.all()
    for sample_alignment in sample_alignments:

        # If there is no callable_loci bed, skip the sample alignment.
        # TODO: Make this extensible to other BED files we might have
        callable_loci_bed = get_dataset_with_type(
                entity=sample_alignment,
                type=Dataset.TYPE.BED_CALLABLE_LOCI)

        if not callable_loci_bed:
            continue

        # need to add sample_alignment and bed_dataset here.
        add_variants_to_set_from_bed(
                sample_alignment=sample_alignment,
                bed_dataset=callable_loci_bed)
Example #38
def prepare_jbrowse_ref_sequence(reference_genome, **kwargs):
    """Prepare the reference sequence and place it in the ref_genome dir.

    This implicitly creates the config directory structure for this reference
    genome. Tracks added in the future are added relative to this reference
    genome.

    The implementation of this method is a light wrapper around
    jbrowse/bin/prepare-refseqs.pl.
    """
    PREPARE_REFSEQS_BIN = os.path.join(JBROWSE_BIN_PATH, 'prepare-refseqs.pl')

    # First ensure that the reference genome exists. If it fails, try to
    # convert from genbank, then give up.
    reference_fasta = get_dataset_with_type(
            reference_genome,
            type=Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

    # Next, ensure that the jbrowse directory exists.
    reference_genome.ensure_jbrowse_dir()
    jbrowse_path = os.path.join(
            reference_genome.get_jbrowse_directory_path(),
            'indiv_tracks',
            'DNA')

    # Now run prepare-refseqs.pl to get the ReferenceGenome in.
    subprocess.check_call([
        PREPARE_REFSEQS_BIN,
        '--fasta', reference_fasta,
        '--out', jbrowse_path,
    ])

    json_tracks = get_tracklist_json(reference_genome, 'DNA')

    # DNA track should be the first track
    dna_track = json_tracks['tracks'][0]
    assert dna_track['type'] == 'SequenceTrack'

    # Get rid of translation and reverse strand
    dna_track.update({
        "showForwardStrand": True,
        "showReverseStrand": False,
        "showTranslation": False
        })

    write_tracklist_json(reference_genome, json_tracks, 'DNA')
Example #39
    def derivation_fn(sample_alignment, unmapped_reads_dataset):
        # Get the original bam file.
        bam_dataset = get_dataset_with_type(sample_alignment,
                                            Dataset.TYPE.BWA_ALIGN)
        bam_filename = bam_dataset.get_absolute_location()

        # Allocate a filename for the unmapped reads.
        unmapped_reads_bam_file = (os.path.splitext(bam_filename)[0] +
                                   '.unmapped.bam')
        unmapped_reads_dataset.filesystem_location = clean_filesystem_location(
            unmapped_reads_bam_file)
        unmapped_reads_dataset.save(update_fields=['filesystem_location'])

        cmd = '{samtools} view -h -b -f 0x4 {bam_filename}'.format(
            samtools=settings.SAMTOOLS_BINARY, bam_filename=bam_filename)
        with open(unmapped_reads_bam_file, 'w') as output_fh:
            subprocess.check_call(cmd, stdout=output_fh, shell=True)
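
The 0x4 flag passed to samtools view above is the SAM "read unmapped" flag. As a side note, the same flag can be used just to count unmapped reads; a minimal sketch, assuming samtools is on PATH:

import subprocess

def count_unmapped_reads_sketch(bam_filename):
    """Returns the number of unmapped reads (SAM flag 0x4) in a BAM file.

    Assumption: 'samtools' is on PATH; -c counts records instead of
    printing them, -f 4 keeps only reads with the unmapped flag set.
    """
    output = subprocess.check_output(
            ['samtools', 'view', '-c', '-f', '4', bam_filename])
    return int(output.strip())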
Example #40
def get_vcf_files(alignment_group):
    """Gets vcf files related to the AlignmentGroup.

    Returns:
        Dict mapping from vcf type to file location.
    """
    vcf_files = {}
    vcf_types = [VARIANT_TOOL_PARAMS_MAP[tool]['dataset_type']
            for tool in settings.ENABLED_VARIANT_CALLERS]
    for vcf_type in vcf_types:
        vcf_dataset = get_dataset_with_type(alignment_group, vcf_type)
        if vcf_dataset is None:
            continue
        vcf_location = vcf_dataset.get_absolute_location()
        assert os.path.exists(vcf_location)
        vcf_files[vcf_type] = vcf_location
    return vcf_files
Example #41
def main():
    if not os.path.exists(OUTPUT_DIR):
        os.mkdir(OUTPUT_DIR)

    for sa in ExperimentSampleToAlignment.objects.all():
        histo_dataset = get_dataset_with_type(
            sa, Dataset.TYPE.LUMPY_INSERT_METRICS_HISTOGRAM)
        histo_dataset_full_path = histo_dataset.get_absolute_location()

        # Update Dataset name.
        histo_dataset_name = (
            os.path.split(os.path.split(histo_dataset_full_path)[0])[1] +
            '.txt')

        # Copy.
        new_full_path = os.path.join(OUTPUT_DIR, histo_dataset_name)
        shutil.copyfile(histo_dataset_full_path, new_full_path)
Example #42
    def derivation_fn(sample_alignment, unmapped_reads_dataset):
        # Get the original bam file.
        bam_dataset = get_dataset_with_type(sample_alignment,
                Dataset.TYPE.BWA_ALIGN)
        bam_filename = bam_dataset.get_absolute_location()

        # Allocate a filename for the unmapped reads.
        unmapped_reads_bam_file = (os.path.splitext(bam_filename)[0] +
                '.unmapped.bam')
        unmapped_reads_dataset.filesystem_location = clean_filesystem_location(
                unmapped_reads_bam_file)
        unmapped_reads_dataset.save(update_fields=['filesystem_location'])

        cmd = '{samtools} view -h -b -f 0x4 {bam_filename}'.format(
                samtools=settings.SAMTOOLS_BINARY,
                bam_filename=bam_filename)
        with open(unmapped_reads_bam_file, 'w') as output_fh:
            subprocess.check_call(cmd, stdout=output_fh, shell=True)
예제 #43
0
def get_vcf_files(alignment_group):
    """Gets vcf files related to the AlignmentGroup.

    Returns:
        Dict mapping from vcf type to file location.
    """
    vcf_files = {}
    vcf_types = [
        VARIANT_TOOL_PARAMS_MAP[tool]['dataset_type']
        for tool in settings.ENABLED_VARIANT_CALLERS
    ]
    for vcf_type in vcf_types:
        vcf_dataset = get_dataset_with_type(alignment_group, vcf_type)
        if vcf_dataset is None:
            continue
        vcf_location = vcf_dataset.get_absolute_location()
        assert os.path.exists(vcf_location)
        vcf_files[vcf_type] = vcf_location
    return vcf_files
예제 #44
0
def compute_callable_loci(reference_genome,
                          sample_alignment,
                          bam_file_location,
                          stderr=None):

    # Set output fn to None in case try fails.
    callable_loci_bed_fn = None

    try:
        ref_genome_fasta_location = get_dataset_with_type(
            reference_genome,
            Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

        callable_loci_bed_fn = (
            _get_callable_loci_output_filename(bam_file_location))

        get_callable_loci(bam_file_location, callable_loci_bed_fn)

        # Add callable loci bed as dataset
        callable_loci_bed_dataset = Dataset.objects.create(
            label=Dataset.TYPE.BED_CALLABLE_LOCI,
            type=Dataset.TYPE.BED_CALLABLE_LOCI,
            filesystem_location=clean_filesystem_location(
                callable_loci_bed_fn))

        sample_alignment.dataset_set.add(callable_loci_bed_dataset)
        sample_alignment.save()

        clean_bed_fn = clean_bed_features(callable_loci_bed_dataset,
                                          stderr=stderr)

        # add it as a jbrowse track
        add_bed_file_track(reference_genome, sample_alignment,
                           callable_loci_bed_dataset)

    except Exception as e:
        print >> stderr, 'WARNING: Callable Loci failed.'
        print >> stderr, str(e)
        clean_bed_fn = ''

    finally:
        return clean_bed_fn
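
# get_callable_loci() is not shown in this snippet. One plausible
# implementation (an assumption, not necessarily the project's code) wraps
# GATK 3's CallableLoci walker, which writes a BED classifying each position
# as CALLABLE / LOW_COVERAGE / NO_COVERAGE / etc.:
import subprocess

def get_callable_loci_sketch(reference_fasta, bam_file, output_bed,
                             gatk_jar='GenomeAnalysisTK.jar'):
    # CallableLoci also requires a summary table output.
    summary_file = output_bed + '.summary.txt'
    subprocess.check_call([
        'java', '-jar', gatk_jar,
        '-T', 'CallableLoci',
        '-R', reference_fasta,
        '-I', bam_file,
        '-summary', summary_file,
        '-o', output_bed,
    ])
    return output_bed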
예제 #45
0
    def test_dataset_strings(self):

        user = User.objects.create_user(TEST_USERNAME,
                                        password=TEST_PASSWORD,
                                        email=TEST_EMAIL)

        self.test_project = Project.objects.create(title=TEST_PROJECT_NAME,
                                                   owner=user.get_profile())

        self.test_ref_genome = import_reference_genome_from_local_file(
            self.test_project, TEST_REF_GENOME_NAME, TEST_REF_GENOME_PATH,
            'genbank')

        dataset = get_dataset_with_type(
            self.test_ref_genome, type=Dataset.TYPE.REFERENCE_GENOME_GENBANK)

        self.assertEquals(
            dataset.internal_string(self.test_ref_genome),
            (str(self.test_ref_genome.uid) + '_' +
             uppercase_underscore(Dataset.TYPE.REFERENCE_GENOME_GENBANK)))
예제 #46
0
def compute_callable_loci(reference_genome, sample_alignment,
            bam_file_location, stderr=None):

    # Set output fn to None in case try fails.
    callable_loci_bed_fn = None

    try:
        ref_genome_fasta_location = get_dataset_with_type(
                reference_genome,
                Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

        callable_loci_bed_fn = (
                _get_callable_loci_output_filename(bam_file_location))

        get_callable_loci(bam_file_location, callable_loci_bed_fn)

        # Add callable loci bed as dataset
        callable_loci_bed_dataset = Dataset.objects.create(
                label=Dataset.TYPE.BED_CALLABLE_LOCI,
                type=Dataset.TYPE.BED_CALLABLE_LOCI,
                filesystem_location=clean_filesystem_location(callable_loci_bed_fn))

        sample_alignment.dataset_set.add(callable_loci_bed_dataset)
        sample_alignment.save()

        clean_bed_fn = clean_bed_features(callable_loci_bed_dataset, stderr=stderr)

        # add it as a jbrowse track
        add_bed_file_track(
                reference_genome,
                sample_alignment,
                callable_loci_bed_dataset)

    except Exception as e:
        print >> stderr, 'WARNING: Callable Loci failed.'
        print >> stderr, str(e)
        clean_bed_fn = ''

    finally:
        return clean_bed_fn
예제 #47
0
    def _run_genome_finish_test(self, data_dict, mismatch_tolerance=0):

        contigs = self._perform_assembly(data_dict)

        # Assert contigs were generated
        self.assertTrue(contigs.count() > 0)
        self.assertTrue(contigs[0].num_bases > 0)

        ag = contigs[0].experiment_sample_to_alignment.alignment_group

        # Get set of de novo variants
        variant_set = create_de_novo_variants_set(ag, 'de_novo_variants')

        contigs_found_error_str = (str(len(contigs)) +
            ' contigs found with lengths: ' +
            ', '.join([str(c.num_bases) for c in contigs]))

        self.assertTrue(variant_set.variants.exists(),
            'No placeable contigs found.  ' +
            contigs_found_error_str)

        # Make new reference genome
        new_ref_genome_params = {'label': 'new_ref'}
        new_ref_genome = generate_new_reference_genome(
                variant_set, new_ref_genome_params)

        # Verify insertion was placed correctly
        target_fasta = data_dict['target_fasta']
        new_ref_genome_fasta = get_dataset_with_type(
                new_ref_genome, Dataset.TYPE.REFERENCE_GENOME_FASTA
                ).get_absolute_location()

        fastas_same, indexes = are_fastas_same(
                target_fasta, new_ref_genome_fasta)

        indexes_str = str(indexes) if len(indexes) < 50 else (
                str(indexes[:50]) + '...')

        self.assertTrue(len(indexes) <= mismatch_tolerance,
                'Fastas dissimilar at indexes:' +
                indexes_str + '\n' +
                contigs_found_error_str)
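
# are_fastas_same() is assumed here to return (bool, mismatch_indexes). A
# minimal Biopython sketch of such a comparison; the real helper may define
# the indexes differently (e.g. per-chromosome rather than concatenated).
from Bio import SeqIO

def are_fastas_same_sketch(fasta_path_1, fasta_path_2):
    # Compare the concatenated sequences base by base.
    seq1 = ''.join(str(rec.seq) for rec in SeqIO.parse(fasta_path_1, 'fasta'))
    seq2 = ''.join(str(rec.seq) for rec in SeqIO.parse(fasta_path_2, 'fasta'))
    shorter = min(len(seq1), len(seq2))
    mismatch_indexes = [i for i in range(shorter)
                        if seq1[i].upper() != seq2[i].upper()]
    # Count any length difference as additional mismatches.
    mismatch_indexes.extend(range(shorter, max(len(seq1), len(seq2))))
    return len(mismatch_indexes) == 0, mismatch_indexes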
예제 #48
0
def add_vcf_track(reference_genome, alignment_group, vcf_dataset_type):
    """Adds a vcf track to JBrowse for this vcf.

    See JBrowse Docs:
        http://gmod.org/wiki/JBrowse_Configuration_Guide#Example_VCF-based_Variant_Track_Configuration
    """
    # Get the vcf file location from the Dataset of the alignment group,
    # keyed by the vcf_dataset_type.
    vcf_dataset = get_dataset_with_type(alignment_group,
            vcf_dataset_type)

    vcf_dataset = _vcf_to_vcftabix(vcf_dataset)

    if reference_genome.project.is_s3_backed():
        urlTemplate = os.path.join('http://%s.s3.amazonaws.com/' % S3_BUCKET,
            vcf_dataset.filesystem_location.strip("/jbrowse"))
        urlTemplate_idx = os.path.join('http://%s.s3.amazonaws.com/' % S3_BUCKET,
            vcf_dataset.filesystem_idx_location.strip("/jbrowse"))
    else:
        urlTemplate = os.path.join(JBROWSE_DATA_URL_ROOT,
            vcf_dataset.filesystem_location)
        urlTemplate_idx = os.path.join(JBROWSE_DATA_URL_ROOT,
            vcf_dataset.filesystem_idx_location)

    label = vcf_dataset.internal_string(alignment_group)
    key = "{:s} {:s} SNVs".format(vcf_dataset.type, alignment_group.label)

    # Build the JSON object.
    raw_dict_obj = {
        'tracks' : [{
            "label"         : label,
            "key"           : key,
            "storeClass"    : "JBrowse/Store/SeqFeature/VCFTabix",
            "urlTemplate"   : urlTemplate,
            "tbiUrlTemplate": urlTemplate_idx,
            'category'      : 'VCF Tracks',
            "type"          : "JBrowse/View/Track/HTMLVariants"
        }]
    }

    write_tracklist_json(reference_genome, raw_dict_obj, label)
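
# _vcf_to_vcftabix() above is assumed to bgzip-compress and tabix-index the
# vcf, since JBrowse's VCFTabix store needs a .vcf.gz plus a .tbi index. A
# minimal sketch of that step (binary names and return value illustrative):
import subprocess

def vcf_to_vcftabix_sketch(vcf_path, bgzip_binary='bgzip',
                           tabix_binary='tabix'):
    gz_path = vcf_path + '.gz'
    # bgzip -c writes block-gzipped output to stdout.
    with open(gz_path, 'wb') as gz_fh:
        subprocess.check_call([bgzip_binary, '-c', vcf_path], stdout=gz_fh)
    # tabix -p vcf creates <gz_path>.tbi.
    subprocess.check_call([tabix_binary, '-p', 'vcf', gz_path])
    return gz_path, gz_path + '.tbi'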
예제 #49
0
def fastqc_view(request, project_uid, sample_uid, read_num):

    project = get_object_or_404(Project, owner=request.user.get_profile(),
            uid=project_uid)

    sample = get_object_or_404(ExperimentSample, project=project,
            uid=sample_uid)

    if int(read_num) == 1:
        dataset_type = Dataset.TYPE.FASTQC1_HTML
    elif int(read_num) == 2:
        dataset_type = Dataset.TYPE.FASTQC2_HTML
    else:
        raise Exception('Read number must be 1 or 2')

    fastqc_dataset = get_dataset_with_type(sample, dataset_type)

    response = HttpResponse(mimetype="text/html")
    for line in open(fastqc_dataset.get_absolute_location()):
        response.write(line)
    return response
예제 #50
0
def fastqc_view(request, project_uid, sample_uid, read_num):

    project = get_object_or_404(Project, owner=request.user.get_profile(),
            uid=project_uid)

    sample = get_object_or_404(ExperimentSample, project=project,
            uid=sample_uid)

    if int(read_num) == 1:
        dataset_type = Dataset.TYPE.FASTQC1_HTML
    elif int(read_num) == 2:
        dataset_type = Dataset.TYPE.FASTQC2_HTML
    else:
        raise Exception('Read number must be 1 or 2')

    fastqc_dataset = get_dataset_with_type(sample, dataset_type)

    response = HttpResponse(mimetype="text/html")
    for line in open(fastqc_dataset.get_absolute_location()):
        response.write(line)
    return response
예제 #51
0
def prepare_ref_genome_related_datasets(ref_genome, dataset):
    """Prepares data related to a ReferenceGenome.

    For example, if only Genbank exists, creates a Fasta Dataset.

    If related Datasets exist, this function is a no-op.

    Args:
        ref_genome: ReferenceGenome.
        dataset: A dataset pointing to a genome.

    Raises:
        AssertionError if dataset status is NOT_STARTED.
    """
    assert dataset.status != Dataset.STATUS.NOT_STARTED

    if dataset.type == Dataset.TYPE.REFERENCE_GENOME_FASTA:
        # Make sure the fasta index is generated.

        # Run jbrowse ref genome processing.
        prepare_jbrowse_ref_sequence(ref_genome)

    elif dataset.type == Dataset.TYPE.REFERENCE_GENOME_GENBANK:
        # Run snpeff build after creating ReferenceGenome obj.
        build_snpeff(ref_genome)

        # These functions are NO-OPS if the respective Datasets exist.
        generate_fasta_from_genbank(ref_genome)
        generate_gff_from_genbank(ref_genome)

        # Run jbrowse genbank genome processing for genes
        add_genbank_file_track(ref_genome)

    # We create the bwa index once here, so that alignments running in
    # parallel don't step on each others' toes.
    ref_genome_fasta = get_dataset_with_type(ref_genome,
            Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()
    ensure_bwa_index(ref_genome_fasta)
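
# The "fasta index" comment in the FASTA branch above has no code attached in
# this snippet. If it refers to a samtools .fai index, a minimal sketch would
# be (an assumption about what that step does):
import os
import subprocess

def ensure_fasta_index_sketch(fasta_path, samtools_binary='samtools'):
    # samtools faidx writes <fasta>.fai next to the fasta if it is missing.
    fai_path = fasta_path + '.fai'
    if not os.path.exists(fai_path):
        subprocess.check_call([samtools_binary, 'faidx', fasta_path])
    return fai_path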
예제 #52
0
def generate_gff_from_genbank(ref_genome):
    """If this reference genome has a genbank but not a GFF, generate
    a GFF from the genbank. """

    # If a GFF already exists, then just return.
    if ref_genome.dataset_set.filter(
            type=Dataset.TYPE.REFERENCE_GENOME_GFF).exists():
        return

    # Check that a genbank exists.
    assert ref_genome.dataset_set.filter(
            type=Dataset.TYPE.REFERENCE_GENOME_GENBANK).exists()

    # Get genbank path and filename components (for creating GFF file name).
    genbank_path = get_dataset_with_type(
            ref_genome,
            type=Dataset.TYPE.REFERENCE_GENOME_GENBANK).get_absolute_location()

    genbank_dir, genbank_filename = os.path.split(genbank_path)
    genbank_noext = os.path.splitext(genbank_filename)[0]

    # Put the GFF file in the same dir, just change the extension to .gff.
    gff_filename = os.path.join(genbank_dir, (genbank_noext + '.gff'))

    # Get the individual records, each corresponding to a chromosome.
    genome_records = list(SeqIO.parse(genbank_path, 'genbank'))

    # SnpEff takes the name attr, but BioPython uses the id attr to make its
    # GFF file, so overwrite the name with the id when converting to GFF.

    for genome_record in genome_records:
        genome_record.name = genome_record.id

    GFF.write(genome_records, open(gff_filename, 'w'))

    dataset_type = IMPORT_FORMAT_TO_DATASET_TYPE['gff']
    copy_and_add_dataset_source(ref_genome, dataset_type,
            dataset_type, gff_filename)
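
# For reference, a self-contained version of the conversion above, with the
# imports it relies on (Biopython plus the bcbio-gff package that provides
# BCBio.GFF). File paths are illustrative.
from Bio import SeqIO
from BCBio import GFF

def genbank_to_gff_sketch(genbank_path, gff_path):
    # Parse every record (chromosome) and write them all to one GFF file.
    records = list(SeqIO.parse(genbank_path, 'genbank'))
    for record in records:
        # Keep name and id in sync, mirroring the snippet above.
        record.name = record.id
    with open(gff_path, 'w') as out_fh:
        GFF.write(records, out_fh)
    return gff_path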
예제 #53
0
def _find_valid_sample_alignments(alignment_group, alignment_type):
    """ Returns a list of sample alignment objects for an alignment,
        skipping those that failed. """
    sample_alignment_list = (
        alignment_group.experimentsampletoalignment_set.all())

    # Filter out sample alignments that failed.
    # TODO: Should we show in the UI that some alignments failed and are
    # being skipped?
    def _is_successful_alignment(sample_alignment):
        bam_dataset = get_dataset_with_type(sample_alignment, alignment_type)
        return bam_dataset.status == Dataset.STATUS.READY

    sample_alignment_list = [
        sample_alignment for sample_alignment in sample_alignment_list
        if _is_successful_alignment(sample_alignment)
    ]

    if len(sample_alignment_list) == 0:
        raise Exception('No successful alignments, Freebayes cannot proceed.')

    bam_files = [
        get_dataset_with_type(sa, alignment_type).get_absolute_location()
        for sa in sample_alignment_list
    ]

    # Keep only valid bam_files
    valid_bam_files = []
    for bam_file in bam_files:
        if bam_file is None:
            continue
        if not os.stat(bam_file).st_size > 0:
            continue
        valid_bam_files.append(bam_file)
    assert len(valid_bam_files) == len(sample_alignment_list), (
        "Expected %d bam files, but found %d" %
        (len(sample_alignment_list), len(valid_bam_files)))
    return sample_alignment_list
예제 #54
0
def add_contig_reads_bam_track(contig, alignment_type):
    """Update the JBrowse track config file, trackList.json, for this
    ReferenceGenome with a track for the given contig's reads bam and
    alignment_type.
    """
    # Get the bam file location from the Dataset of the contig,
    # keyed by the alignment_type.
    bam_dataset = get_dataset_with_type(contig, alignment_type)

    # Figure out the url that JBrowse would use to show the data, e.g.:
    #     /jbrowse/gd_data/projects/58a62c7d/genomes/8dc829ec/align.bam
    # urlTemplate = os.path.join(JBROWSE_DATA_URL_ROOT,
    #         bam_dataset.filesystem_location)

    # NOTE: We should construct bam file urls using project.get_client
    # jbrowse_link() rather than checking S3 flag here.
    reference_genome = contig.parent_reference_genome
    if reference_genome.project.is_s3_backed():
        urlTemplate = os.path.join(
            'http://%s.s3.amazonaws.com/' % S3_BUCKET,
            bam_dataset.filesystem_location.strip("/jbrowse"))
    else:
        urlTemplate = os.path.join(JBROWSE_DATA_URL_ROOT,
                                   bam_dataset.filesystem_location)

    # doing label as ES_AG because SA isn't currently used in the variant view
    label = bam_dataset.internal_string(contig)

    key = bam_dataset.external_string(contig)

    # Build the JSON object.
    raw_dict_obj = {
        'tracks': [{
            'storeClass': 'JBrowse/Store/SeqFeature/BAM',
            'urlTemplate': urlTemplate,
            'label': label,
            'type': 'JBrowse/View/Track/Alignments2',
            'chunkSizeLimit': 10000000,  # double the default chunk size
            'key': key,
            'category': 'Contig BAM Tracks',
            'style': {
                'className': 'alignment',
                'arrowheadClass': 'arrowhead',
                'labelScale': 100
            }
        }]
    }
    write_tracklist_json(reference_genome, raw_dict_obj, label)

    # Also add a snp coverage track.
    snp_coverage_label = bam_dataset.internal_string(contig) + '_COVERAGE'

    snp_coverage_key = key + ' Coverage'
    coverage_raw_dict_obj = {
        'tracks': [{
            'storeClass': 'JBrowse/Store/SeqFeature/BAM',
            'urlTemplate': urlTemplate,
            'label': snp_coverage_label,
            'type': 'JBrowse/View/Track/SNPCoverage',
            'category': 'Contig Coverage Tracks',
            'key': snp_coverage_key
        }]
    }
    write_tracklist_json(reference_genome, coverage_raw_dict_obj,
                         snp_coverage_label)
예제 #55
0
def build_snpeff(ref_genome):
    """Setup the SnpEff database for ref_genome.

    This function does the following:
        * Sets up the directory structure for SnpEff-related files.
        * Writes a possibly modified Genbank to the location that SnpEff
              expects to find it. A few cleanups are necessary to avoid SnpEff
              quirks.
        * Creates the SnpEff config file for building the database/index.
        * Builds the SnpEff database/index.

    SnpEFF needs a config file for every reference genome, which lists a
    single reference genome, its chromosomes, and the codon table that
    each uses. For now we can assume that all our genomes will use bacterial
    codons. Every reference genome in the config file should look similar to:

    # Ecoli K12 MG1655
    NC_000913.genome : Escherichia_coli
        NC_000913.chromosomes : NC_000913
        NC_000913.NC_000913.codonTable: Bacterial_and_Plant_Plastid

    We have made a template that can do this with yaml rendering, in the
    snpEff tools directory. Given a ref_genome object, it generates a
    snpEff config file, builds a snpEff database file for the genome,
    and places it in the ref genome's data dir under ./snpeff.
    """

    # if no genbank file for this ref genome, then do nothing
    if not ref_genome.is_annotated():
        print "Skipping SnpEff indexing: No genbank for reference genome %s" % (
            ref_genome.uid)
        return

    # Get the path to the reference genbank, making sure it exists.
    ref_genome_path = get_dataset_with_type(
        ref_genome,
        type=Dataset.TYPE.REFERENCE_GENOME_GENBANK).get_absolute_location()
    assert ref_genome_path is not None, "Reference Genbank missing."

    # Create the snpeff directory structure.
    ref_genome.ensure_snpeff_dir()

    # Build a template data dictionary which will be passed to the django
    # template renderer in order to generate the config file.
    templ_data = {}
    templ_data['snpeff_dir'] = ref_genome.get_snpeff_dir()
    templ_data['uid'] = ref_genome.uid
    templ_data['label'] = ref_genome.label

    # The following block does 2 things:
    #    1. Identifies all chromosomes in the Genbank.
    #    2. Ensures that each contained SeqRecord's name and id match, which
    #       is required by SnpEff.
    templ_data['chromosomes'] = []
    new_genbank_seq_records = []
    with open(ref_genome_path) as genbank_fh:
        for seq_record in SeqIO.parse(genbank_fh, 'genbank'):
            # Set the ACCESSION/LOCUS/VERSION to all be the same for this
            # new modified genbank
            seq_record.name = seq_record.id
            new_genbank_seq_records.append(seq_record)

            # Add this record as a chromosome to this ref genome
            # TODO: Do we want to check seqrecords for sane/sanitized names?
            templ_data['chromosomes'].append(seq_record.name)

    templ_data['chrs_string'] = ','.join(templ_data['chromosomes'])

    # Write the updated Genbank.
    snpeff_genbank_path = ref_genome.get_snpeff_genbank_file_path()
    SeqIO.write(new_genbank_seq_records, snpeff_genbank_path, 'genbank')

    # Stop-gap fix to ensure line lengths in Genbank to appease SnpEff.
    ensure_line_lengths(ref_genome.get_snpeff_genbank_file_path())

    # Render SnpEff config template.
    render_snpeff_config(templ_data, ref_genome.get_snpeff_config_path())

    # Build snpEff database
    build_snpeff_db(ref_genome.get_snpeff_config_path(), ref_genome.uid)
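
# build_snpeff_db() is assumed to shell out to snpEff's 'build' subcommand
# against the rendered config; a minimal sketch (jar path illustrative):
import subprocess

def build_snpeff_db_sketch(snpeff_config_path, genome_uid,
                           snpeff_jar='snpEff.jar'):
    # 'snpEff build -genbank' builds snpEffectPredictor.bin for the config
    # entry named <genome_uid> from the Genbank in its snpEff data dir.
    subprocess.check_call([
        'java', '-jar', snpeff_jar,
        'build',
        '-genbank',
        '-v',
        '-c', snpeff_config_path,
        genome_uid,
    ])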
예제 #56
0
def run_snpeff(alignment_group, vcf_source_tool):
    """Run snpeff on an alignment group after creating a vcf with a snpcaller.

    We only use the alignment type to store the snpeff file.

    Returns the snpeff vcf output filename.
    """
    assert vcf_source_tool in MAP_VCF_SOURCE_TOOL_TO_ORIGINAL_VCF_DATASET_TYPE

    # Get the reference genome uid to get the config path and snpeff genome name
    ref_genome = alignment_group.reference_genome
    ref_genome_uid = alignment_group.reference_genome.uid

    source_vcf_dataset_type = (
        MAP_VCF_SOURCE_TOOL_TO_ORIGINAL_VCF_DATASET_TYPE[vcf_source_tool])
    source_vcf_dataset = get_dataset_with_type(alignment_group,
                                               type=source_vcf_dataset_type)
    assert source_vcf_dataset is not None
    vcf_input_filename = source_vcf_dataset.get_absolute_location()
    assert os.path.exists(vcf_input_filename)

    # Make sure vcf has at least one record. If not, return.
    with open(vcf_input_filename) as unannotated_fh:
        vcf_reader = vcf.Reader(unannotated_fh)
        try:
            vcf_reader.next()
        except StopIteration:
            # No variants called. No need to do SnpEff.
            return

    # Prepare a directory to put the output file.
    vcf_output_filename = get_snpeff_vcf_output_path(alignment_group,
                                                     vcf_source_tool)

    snpeff_args = [
        'java',
        '-jar',
        settings.SNPEFF_JAR_PATH,
        'eff',
        '-v',
        '-i',
        'vcf',
        '-o',
        'vcf',
        '-c',
        ref_genome.get_snpeff_config_path(),
        '-ud',
        str(settings.SNPEFF_UD_INTERVAL_LENGTH),
        '-formatEff',
        '-q',
        '-noLog',
        #        '-t', str(settings.SNPEFF_THREADS),
        ref_genome_uid,
        vcf_input_filename
    ]

    print ' '.join(snpeff_args)

    with open(vcf_output_filename, 'w') as fh_out:
        snpeff_proc = subprocess.Popen(snpeff_args, stdout=subprocess.PIPE)
        convert_snpeff_info_fields(snpeff_proc.stdout, fh_out)

    return vcf_output_filename
예제 #57
0
def parse_alignment_group_vcf(alignment_group, vcf_dataset_type):
    """Parses the VCF associated with the AlignmentGroup and saves data there.
    """
    vcf_dataset = get_dataset_with_type(alignment_group, vcf_dataset_type)
    parse_vcf(vcf_dataset, alignment_group)
예제 #58
0
def run_delly(fasta_ref, sample_alignments, vcf_output_dir,
              vcf_output_filename, alignment_type, **kwargs):
    """Run delly to find SVs."""
    assert os.path.exists(
        settings.DELLY_BIN), ('Delly is not installed. Aborting.')

    delly_root = vcf_output_filename[:-4]  # get rid of .vcf extension
    transformations = ['DEL', 'DUP', 'INV']
    vcf_outputs = map(
        lambda transformation: '%s_%s.vcf' % (delly_root, transformation),
        transformations)

    # Create symlinks to the bam files, named by sample uid, because Delly
    # uses the file name as the sample id in the output report.
    new_bam_files = []
    bam_files = [
        get_dataset_with_type(sa, alignment_type).get_absolute_location()
        for sa in sample_alignments
    ]
    samples = [sa.experiment_sample for sa in sample_alignments]
    for bam_file, sample in zip(bam_files, samples):
        new_bam_file = os.path.join(os.path.dirname(bam_file),
                                    sample.uid + '.bam')
        _clean_symlink(bam_file, new_bam_file)
        _clean_symlink(bam_file + '.bai', new_bam_file + '.bai')
        new_bam_files.append(new_bam_file)

    # run delly for each type of transformation
    for transformation, vcf_output in zip(transformations, vcf_outputs):

        # not checked_call, because delly errors if it doesn't find any SVs
        subprocess.call([
            settings.DELLY_BIN, '-t', transformation, '-o', vcf_output, '-g',
            fasta_ref
        ] + new_bam_files)

    # combine the separate vcfs for each transformation
    vcf_outputs = [f for f in vcf_outputs if os.path.exists(f)]
    if vcf_outputs:
        temp_vcf = os.path.join(vcf_output_dir, 'temp_vcf')
        os.putenv('PERL5LIB', os.path.join(settings.VCFTOOLS_DIR, 'perl'))
        with open(temp_vcf, 'w') as fh:
            subprocess.check_call([settings.VCF_CONCAT_BINARY] + vcf_outputs,
                                  stdout=fh)
        with open(vcf_output_filename, 'w') as fh:
            subprocess.check_call([settings.VCF_SORT_BINARY, temp_vcf],
                                  stdout=fh)
        os.remove(temp_vcf)
    else:
        # hack: create empty vcf
        subprocess.check_call(['touch', delly_root])
        subprocess.check_call([
            '%s/pindel/pindel2vcf' % settings.TOOLS_DIR,
            '-p',
            delly_root,  # TODO does this work?
            '-r',
            fasta_ref,
            '-R',
            'name',
            '-d',
            'date'
        ])

    # Delete temporary bam file symlinks.
    for f in new_bam_files:
        os.remove(f)
        os.remove(f + '.bai')

    postprocess_delly_vcf(vcf_output_filename)

    return True  # success
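
# _clean_symlink() above is assumed to replace any stale link (or file) at
# the target before re-linking; a minimal sketch:
import os

def clean_symlink_sketch(source_path, link_path):
    # lexists() is True for broken symlinks too, so those also get replaced.
    if os.path.lexists(link_path):
        os.remove(link_path)
    os.symlink(source_path, link_path)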