def add_vcf_dataset(alignment_group, vcf_dataset_type, vcf_output_filename):
    """
    Sort the vcf file, create a vcf Dataset, and add it to the alignment group.
    """
    sort_vcf(vcf_output_filename)

    # If a Dataset already exists, delete it; it might be left over from a bad run.
    existing_set = Dataset.objects.filter(
            type=vcf_dataset_type,
            label=vcf_dataset_type,
            filesystem_location=clean_filesystem_location(
                    vcf_output_filename)
    )

    if len(existing_set) > 0:
        existing_set[0].delete()

    vcf_dataset = Dataset.objects.create(
            type=vcf_dataset_type,
            label=vcf_dataset_type,
            filesystem_location=clean_filesystem_location(
                    vcf_output_filename),
    )
    alignment_group.dataset_set.add(vcf_dataset)

    return vcf_dataset
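
# A minimal usage sketch for the helper above (hypothetical call site; it
# assumes an AlignmentGroup and a snpEff-annotated Freebayes VCF on disk
# already exist):
#
#     vcf_dataset = add_vcf_dataset(
#             alignment_group,
#             Dataset.TYPE.VCF_FREEBAYES_SNPEFF,
#             vcf_output_filename)
#
# The stored filesystem_location is the path as cleaned by
# clean_filesystem_location() (see the test in Example #4 below).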
Example #2
def add_vcf_dataset(alignment_group, vcf_dataset_type, vcf_output_filename):
    """Sort vcf file, creates vcf dataset, and adds it to the alignment group.
    """
    if not os.path.exists(vcf_output_filename):
        return None

    sort_vcf(vcf_output_filename)

    # If a Dataset already exists, delete it; it might be left over from a bad run.
    existing_set = Dataset.objects.filter(
        type=vcf_dataset_type,
        label=vcf_dataset_type,
        filesystem_location=clean_filesystem_location(vcf_output_filename))

    if len(existing_set) > 0:
        existing_set[0].delete()

    vcf_dataset = Dataset.objects.create(
        type=vcf_dataset_type,
        label=vcf_dataset_type,
        filesystem_location=clean_filesystem_location(vcf_output_filename),
    )
    alignment_group.dataset_set.add(vcf_dataset)

    return vcf_dataset
Example #3
    def test_dataset_compression_piping(self):
        """
        Make sure data set compression behaves correctly.
        """
        dataset = Dataset.objects.create(label='test_dataset',
                                         type=Dataset.TYPE.FASTQ1)

        GZIPPED_FASTQ_FILEPATH = os.path.join(settings.PWD, 'test_data',
                                              'compressed_fastq',
                                              'sample0.simLibrary.1.fq.gz')

        dataset.filesystem_location = clean_filesystem_location(
            GZIPPED_FASTQ_FILEPATH)

        assert dataset.is_compressed()

        process = subprocess.Popen(
            ('head ' + dataset.wrap_if_compressed() + ' | wc -l'),
            shell=True,
            executable=settings.BASH_PATH,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)

        wc_output, errmsg = process.communicate()
        rc = process.returncode

        assert rc == 0, (
            "Compression process returned non-zero exit status: %s" % (errmsg))

        assert int(wc_output) == 10, ("Compression failed: %s" % (errmsg))
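
        # Note on the command under test: wrap_if_compressed() is assumed to
        # return a bash process-substitution string for gzipped files
        # (something like '<(gzip -dc <path>)'), so the pipeline expands to
        # roughly:
        #     head <(gzip -dc sample0.simLibrary.1.fq.gz) | wc -l
        # i.e. the first 10 decompressed lines are counted. BASH_PATH matters
        # because process substitution is a bash feature, not POSIX sh.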
Example #4
    def test_clean_filesystem_location(self):
        FAKE_ABS_ROOT = '/root/of/all/evil'
        EXPECTED_CLEAN_URL = 'projects/blah'
        dirty_full_url = os.path.join(FAKE_ABS_ROOT, settings.MEDIA_ROOT,
                                      EXPECTED_CLEAN_URL)
        clean_location = clean_filesystem_location(dirty_full_url)
        self.assertEqual(EXPECTED_CLEAN_URL, clean_location)
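
    # For reference, what clean_filesystem_location() is assumed to do, based
    # on this test: strip everything up to and including settings.MEDIA_ROOT
    # so that only the portable, MEDIA_ROOT-relative path is stored. A sketch
    # only, not the project's actual implementation:
    #
    #     def clean_filesystem_location(filesystem_location):
    #         idx = filesystem_location.find(settings.MEDIA_ROOT)
    #         if idx == -1:
    #             return filesystem_location
    #         tail = filesystem_location[idx + len(settings.MEDIA_ROOT):]
    #         return tail.lstrip('/')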
Example #5
    def test_clean_filesystem_location(self):
        FAKE_ABS_ROOT = '/root/of/all/evil'
        EXPECTED_CLEAN_URL = 'projects/blah'
        dirty_full_url = os.path.join(FAKE_ABS_ROOT, settings.MEDIA_ROOT,
                EXPECTED_CLEAN_URL)
        clean_location = clean_filesystem_location(dirty_full_url)
        self.assertEqual(EXPECTED_CLEAN_URL, clean_location)
    def test_multiple_chromosome_dataset_import(self):
        user = User.objects.create_user(
            TEST_USERNAME, password=TEST_PASSWORD, email=TEST_EMAIL)

        project = Project.objects.create(
            title=TEST_PROJECT_NAME, owner=user.get_profile())

        test_yeast_genome = ReferenceGenome.objects.create(
            project=project,
            label='superbrewer2000')

        test_dataset_path = os.path.join(settings.PWD, 'test_data/yeast_chrom_jkl.fasta')
        dataset_path = copy_dataset_to_entity_data_dir(test_yeast_genome, test_dataset_path)

        test_chroms_dataset  = Dataset.objects.create(
            label='jkl_chroms',
            type=Dataset.TYPE.REFERENCE_GENOME_FASTA,
            filesystem_location=clean_filesystem_location(dataset_path))

        test_yeast_genome.dataset_set.add(test_chroms_dataset)

        # Assert correct number of chromosomes
        assert(test_yeast_genome.num_chromosomes == 3)

        # Assert correct number of bases
        assert(test_yeast_genome.num_bases == sum([chrom.num_bases for chrom in
                Chromosome.objects.filter(reference_genome=test_yeast_genome)]))

        # Assert correct chromosome labels
        expected_chrom_names = [
                'gi|448092123|ref|NC_020215.1|',
                'gi|448096713|ref|NC_020216.1|',
                'gi|448100869|ref|NC_020217.1|']

        assert([chrom.label for chrom in Chromosome.objects.filter(reference_genome=test_yeast_genome)] == expected_chrom_names)
Example #7
    def test_dataset_compression_piping(self):
        """
        Make sure data set compression behaves correctly.
        """
        dataset = Dataset.objects.create(
                label='test_dataset',
                type=Dataset.TYPE.FASTQ1)

        GZIPPED_FASTQ_FILEPATH = os.path.join(settings.PWD, 'test_data',
                'compressed_fastq', 'sample0.simLibrary.1.fq.gz')

        dataset.filesystem_location = clean_filesystem_location(
                    GZIPPED_FASTQ_FILEPATH)

        assert dataset.is_compressed()

        process = subprocess.Popen(
                ('head '+dataset.wrap_if_compressed()+' | wc -l'),
                shell=True, executable=settings.BASH_PATH, stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)

        wc_output, errmsg = process.communicate()
        rc = process.returncode

        assert rc == 0, (
        "Compression process returned non-zero exit status: %s" % (
                errmsg))

        assert int(wc_output) == 10, (
                "Compression failed: %s" % (errmsg))
Example #8
def compute_callable_loci(reference_genome, sample_alignment,
            bam_file_location, stderr=None):

    # Set output fn to None in case try fails.
    callable_loci_bed_fn = None

    try:
        ref_genome_fasta_location = get_dataset_with_type(
                reference_genome,
                Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

        output = _get_callable_loci_output_filename(bam_file_location)

        get_callable_loci(bam_file_location, output)

        # Add callable loci bed as dataset
        callable_loci_bed = Dataset.objects.create(
                label=Dataset.TYPE.BED_CALLABLE_LOCI,
                type=Dataset.TYPE.BED_CALLABLE_LOCI,
                filesystem_location=clean_filesystem_location(output))

        sample_alignment.dataset_set.add(callable_loci_bed)
        sample_alignment.save()

        callable_loci_bed_fn = callable_loci_bed.get_absolute_location()

        output = subprocess.check_output(
                ['cat', callable_loci_bed_fn])

        with open(callable_loci_bed_fn, 'w') as callable_loci_bed_fh:
            for i, line in enumerate(output.split('\n')):
                try:
                    fields = line.split()
                    if len(fields) == 0:
                        continue
                    chrom, start, end, feature = fields

                    feature = titlecase_spaces(feature)
                    # Bed feature can't have spaces =(
                    feature = feature.replace(' ', '_')

                    print >> callable_loci_bed_fh, '\t'.join(
                            [chrom, start, end, feature])
                except Exception as e:
                    print >> stderr, (
                        'WARNING: Callable Loci line '
                        '%d: (%s) couldn\'t be parsed: %s') % (
                                i, line, str(e))

        # add it as a jbrowse track
        add_bed_file_track(reference_genome, sample_alignment, callable_loci_bed)

    except Exception as e:
        print >> stderr, 'WARNING: Callable Loci failed.'
        print >> stderr, str(e)

    return callable_loci_bed_fn
Example #9
def get_split_reads(sample_alignment):
    """Isolate split reads from a sample alignment.

    This uses a python script supplied with Lumpy that is run as a
    separate process.

    NOTE THAT THIS SCRIPT ONLY WORKS WITH BWA MEM.
    """
    bwa_split_dataset = get_dataset_with_type(sample_alignment,
                                              Dataset.TYPE.BWA_SPLIT)
    if bwa_split_dataset is not None:
        if (bwa_split_dataset.status == Dataset.STATUS.READY
                and os.path.exists(bwa_split_dataset.get_absolute_location())):
            return bwa_split_dataset
    else:
        bwa_split_dataset = Dataset.objects.create(
            label=Dataset.TYPE.BWA_SPLIT,
            type=Dataset.TYPE.BWA_SPLIT,
            status=Dataset.STATUS.NOT_STARTED)
        sample_alignment.dataset_set.add(bwa_split_dataset)

    # If here, we are going to run or re-run the Dataset.
    bwa_split_dataset.status = Dataset.STATUS.NOT_STARTED
    bwa_split_dataset.save(update_fields=['status'])

    bam_dataset = get_dataset_with_type(sample_alignment,
                                        Dataset.TYPE.BWA_ALIGN)
    bam_filename = bam_dataset.get_absolute_location()

    assert os.path.exists(
        bam_filename), "BAM file '%s' is missing." % (bam_filename)

    bam_split_filename = os.path.join(sample_alignment.get_model_data_dir(),
                                      'bwa_split_reads.bam')

    try:
        bwa_split_dataset.status = Dataset.STATUS.COMPUTING
        bwa_split_dataset.save(update_fields=['status'])
        extract_split_reads(bam_filename, bam_split_filename)

    except subprocess.CalledProcessError:
        # if there are no split reads, then fail.
        bwa_split_dataset.filesystem_location = ''
        bwa_split_dataset.status = Dataset.STATUS.FAILED
    else:
        bwa_split_dataset.status = Dataset.STATUS.READY
        bwa_split_dataset.filesystem_location = clean_filesystem_location(
            bam_split_filename)

    bwa_split_dataset.save()

    return bwa_split_dataset
Example #10
def get_split_reads(sample_alignment):
    """Isolate split reads from a sample alignment.

    This uses a python script supplied with Lumpy that is run as a
    separate process.

    NOTE THAT THIS SCRIPT ONLY WORKS WITH BWA MEM.
    """
    bwa_split_dataset = get_dataset_with_type(
            sample_alignment, Dataset.TYPE.BWA_SPLIT)
    if bwa_split_dataset is not None:
        if (bwa_split_dataset.status == Dataset.STATUS.READY and
                os.path.exists(bwa_split_dataset.get_absolute_location())):
            return bwa_split_dataset
    else:
        bwa_split_dataset = Dataset.objects.create(
                label=Dataset.TYPE.BWA_SPLIT,
                type=Dataset.TYPE.BWA_SPLIT,
                status=Dataset.STATUS.NOT_STARTED)
        sample_alignment.dataset_set.add(bwa_split_dataset)

    # If here, we are going to run or re-run the Dataset.
    bwa_split_dataset.status = Dataset.STATUS.NOT_STARTED
    bwa_split_dataset.save(update_fields=['status'])

    bam_dataset = get_dataset_with_type(sample_alignment, Dataset.TYPE.BWA_ALIGN)
    bam_filename = bam_dataset.get_absolute_location()

    assert os.path.exists(bam_filename), "BAM file '%s' is missing." % (
            bam_filename)

    bam_split_filename = os.path.join(sample_alignment.get_model_data_dir(),
            'bwa_split_reads.bam')

    try:
        bwa_split_dataset.status = Dataset.STATUS.COMPUTING
        bwa_split_dataset.save(update_fields=['status'])
        extract_split_reads(bam_filename, bam_split_filename)

    except subprocess.CalledProcessError:
        # if there are no split reads, then fail.
        bwa_split_dataset.filesystem_location = ''
        bwa_split_dataset.status = Dataset.STATUS.FAILED
    else:
        bwa_split_dataset.status = Dataset.STATUS.READY
        bwa_split_dataset.filesystem_location = clean_filesystem_location(
                bam_split_filename)

    bwa_split_dataset.save()

    return bwa_split_dataset
Example #11
def get_discordant_read_pairs(sample_alignment):
    """Isolate discordant pairs of reads from a sample alignment.
    """
    # First, check if completed dataset already exists.
    bwa_disc_dataset = get_dataset_with_type(sample_alignment,
                                             Dataset.TYPE.BWA_DISCORDANT)
    if bwa_disc_dataset is not None:
        if (bwa_disc_dataset.status == Dataset.STATUS.READY
                and os.path.exists(bwa_disc_dataset.get_absolute_location())):
            return bwa_disc_dataset
    else:
        bwa_disc_dataset = Dataset.objects.create(
            label=Dataset.TYPE.BWA_DISCORDANT,
            type=Dataset.TYPE.BWA_DISCORDANT)
        sample_alignment.dataset_set.add(bwa_disc_dataset)

    # If here, we are going to run or re-run the Dataset.
    bwa_disc_dataset.status = Dataset.STATUS.NOT_STARTED
    bwa_disc_dataset.save(update_fields=['status'])

    bam_dataset = get_dataset_with_type(sample_alignment,
                                        Dataset.TYPE.BWA_ALIGN)
    bam_filename = bam_dataset.get_absolute_location()

    assert os.path.exists(
        bam_filename), "BAM file '%s' is missing." % (bam_filename)

    # NOTE: This assumes the index just adds at .bai, w/ same path otherwise
    # - will this always be true?
    if not os.path.exists(bam_filename + '.bai'):
        index_bam_file(bam_filename)

    bam_discordant_filename = os.path.join(
        sample_alignment.get_model_data_dir(), 'bwa_discordant_pairs.bam')

    try:
        bwa_disc_dataset.status = Dataset.STATUS.COMPUTING
        bwa_disc_dataset.save(update_fields=['status'])
        extract_discordant_read_pairs(bam_filename, bam_discordant_filename)

    except subprocess.CalledProcessError:
        bwa_disc_dataset.filesystem_location = ''
        bwa_disc_dataset.status = Dataset.STATUS.FAILED
    else:
        bwa_disc_dataset.status = Dataset.STATUS.READY
        bwa_disc_dataset.filesystem_location = clean_filesystem_location(
            bam_discordant_filename)

    bwa_disc_dataset.save()

    return bwa_disc_dataset
Example #12
def get_discordant_read_pairs(sample_alignment):
    """Isolate discordant pairs of reads from a sample alignment.
    """
    # First, check if completed dataset already exists.
    bwa_disc_dataset = get_dataset_with_type(
            sample_alignment, Dataset.TYPE.BWA_DISCORDANT)
    if bwa_disc_dataset is not None:
        if (bwa_disc_dataset.status == Dataset.STATUS.READY and
                os.path.exists(bwa_disc_dataset.get_absolute_location())):
            return bwa_disc_dataset
    else:
        bwa_disc_dataset = Dataset.objects.create(
                label=Dataset.TYPE.BWA_DISCORDANT,
                type=Dataset.TYPE.BWA_DISCORDANT)
        sample_alignment.dataset_set.add(bwa_disc_dataset)

    # If here, we are going to run or re-run the Dataset.
    bwa_disc_dataset.status = Dataset.STATUS.NOT_STARTED
    bwa_disc_dataset.save(update_fields=['status'])

    bam_dataset = get_dataset_with_type(sample_alignment, Dataset.TYPE.BWA_ALIGN)
    bam_filename = bam_dataset.get_absolute_location()

    assert os.path.exists(bam_filename), "BAM file '%s' is missing." % (
            bam_filename)

    # NOTE: This assumes the index just adds at .bai, w/ same path otherwise
    # - will this always be true?
    if not os.path.exists(bam_filename+'.bai'):
        index_bam_file(bam_filename)

    bam_discordant_filename = os.path.join(sample_alignment.get_model_data_dir(),
            'bwa_discordant_pairs.bam')

    try:
        bwa_disc_dataset.status = Dataset.STATUS.COMPUTING
        bwa_disc_dataset.save(update_fields=['status'])
        extract_discordant_read_pairs(bam_filename, bam_discordant_filename)

    except subprocess.CalledProcessError:
        bwa_disc_dataset.filesystem_location = ''
        bwa_disc_dataset.status = Dataset.STATUS.FAILED
    else:
        bwa_disc_dataset.status = Dataset.STATUS.READY
        bwa_disc_dataset.filesystem_location = clean_filesystem_location(
                bam_discordant_filename)

    bwa_disc_dataset.save()

    return bwa_disc_dataset
Example #13
def add_dataset_to_entity(entity, dataset_label, dataset_type,
        filesystem_location=None):
    """Helper function for adding a Dataset to a model.
    """
    dataset = Dataset.objects.create(
            label=dataset_label, type=dataset_type)

    if filesystem_location is not None:
        dataset.filesystem_location = clean_filesystem_location(
                filesystem_location)
        dataset.save()

    entity.dataset_set.add(dataset)
    entity.save()

    return dataset
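
# Hypothetical usage of the helper above, e.g. attaching an existing FASTA
# file to a ReferenceGenome (names here are illustrative, not from the
# original code):
#
#     fasta_dataset = add_dataset_to_entity(
#             reference_genome,
#             'my_reference_fasta',
#             Dataset.TYPE.REFERENCE_GENOME_FASTA,
#             filesystem_location='/path/to/reference.fasta')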
Example #15
    def derivation_fn(sample_alignment, unmapped_reads_dataset):
        # Get the original bam file.
        bam_dataset = get_dataset_with_type(sample_alignment,
                                            Dataset.TYPE.BWA_ALIGN)
        bam_filename = bam_dataset.get_absolute_location()

        # Allocate a filename for the unmapped reads.
        unmapped_reads_bam_file = (os.path.splitext(bam_filename)[0] +
                                   '.unmapped.bam')
        unmapped_reads_dataset.filesystem_location = clean_filesystem_location(
            unmapped_reads_bam_file)
        unmapped_reads_dataset.save(update_fields=['filesystem_location'])

        cmd = '{samtools} view -h -b -f 0x4 {bam_filename}'.format(
            samtools=settings.SAMTOOLS_BINARY, bam_filename=bam_filename)
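        # samtools view flags: -f 0x4 keeps only reads with the "unmapped"
        # flag set, -h includes the header, and -b writes BAM output.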
        with open(unmapped_reads_bam_file, 'w') as output_fh:
            subprocess.check_call(cmd, stdout=output_fh, shell=True)
Example #16
    def derivation_fn(sample_alignment, unmapped_reads_dataset):
        # Get the original bam file.
        bam_dataset = get_dataset_with_type(sample_alignment,
                Dataset.TYPE.BWA_ALIGN)
        bam_filename = bam_dataset.get_absolute_location()

        # Allocate a filename for the unmapped reads.
        unmapped_reads_bam_file = (os.path.splitext(bam_filename)[0] +
                '.unmapped.bam')
        unmapped_reads_dataset.filesystem_location = clean_filesystem_location(
                unmapped_reads_bam_file)
        unmapped_reads_dataset.save(update_fields=['filesystem_location'])

        cmd = '{samtools} view -h -b -f 0x4 {bam_filename}'.format(
                samtools=settings.SAMTOOLS_BINARY,
                bam_filename=bam_filename)
        with open(unmapped_reads_bam_file, 'w') as output_fh:
            subprocess.check_call(cmd, stdout=output_fh, shell=True)
def main():
    # Create a User and Project.
    user = get_or_create_user()
    test_project = Project.objects.create(title=EXAMPLE_PROJECT_NAME,
                                          owner=user.get_profile())
    ref_genome = import_reference_genome_from_local_file(test_project,
                                                         'mg1655',
                                                         MG1655_REF_GENOME,
                                                         'genbank',
                                                         move=False)

    # Create alignment group and relate the vcf Dataset to it.
    alignment_group = AlignmentGroup.objects.create(
        label='Fix Recoli Alignment',
        reference_genome=ref_genome,
        aligner=AlignmentGroup.ALIGNER.BWA)
    vcf_output_path = get_snpeff_vcf_output_path(alignment_group,
                                                 Dataset.TYPE.BWA_ALIGN)
    shutil.copy(LARGE_VCF, vcf_output_path)
    dataset = Dataset.objects.create(
        type=Dataset.TYPE.VCF_FREEBAYES_SNPEFF,
        label=Dataset.TYPE.VCF_FREEBAYES_SNPEFF,
        filesystem_location=clean_filesystem_location(vcf_output_path),
    )
    alignment_group.dataset_set.add(dataset)

    # Import ExperimentSample objects, setting specific uids to match
    # the vcf file.
    with open(EXPERIMENT_SAMPLE_MODEL_DATA_PICKLE) as sample_data_fh:
        es_data = pickle.load(sample_data_fh)
        for es in es_data:
            es_obj = ExperimentSample.objects.create(uid=es.uid,
                                                     project=test_project,
                                                     label=es.label)

            es_obj.data.update({
                'group': es.group,
                'well': es.well,
                'num_reads': es.num_reads
            })
            es_obj.save()

    parse_alignment_group_vcf(alignment_group,
                              Dataset.TYPE.VCF_FREEBAYES_SNPEFF)
def main():
    # Create a User and Project.
    user = get_or_create_user()
    test_project = Project.objects.create(
            title=EXAMPLE_PROJECT_NAME, owner=user.get_profile())
    ref_genome = import_reference_genome_from_local_file(test_project,
            'mg1655', MG1655_REF_GENOME, 'genbank', move=False)

    # Create alignment group and relate the vcf Dataset to it.
    alignment_group = AlignmentGroup.objects.create(
            label='Fix Recoli Alignment',
            reference_genome=ref_genome,
            aligner=AlignmentGroup.ALIGNER.BWA)
    vcf_output_path = get_snpeff_vcf_output_path(alignment_group,
            Dataset.TYPE.BWA_ALIGN)
    shutil.copy(LARGE_VCF, vcf_output_path)
    dataset = Dataset.objects.create(
            type=Dataset.TYPE.VCF_FREEBAYES_SNPEFF,
            label=Dataset.TYPE.VCF_FREEBAYES_SNPEFF,
            filesystem_location=clean_filesystem_location(vcf_output_path),
    )
    alignment_group.dataset_set.add(dataset)

    # Import ExperimentSample objects, setting specific uids to match
    # the vcf file.
    with open(EXPERIMENT_SAMPLE_MODEL_DATA_PICKLE) as sample_data_fh:
        es_data = pickle.load(sample_data_fh)
        for es in es_data:
            es_obj = ExperimentSample.objects.create(
                uid=es.uid,
                project=test_project,
                label=es.label
            )

            es_obj.data.update({
                'group': es.group,
                'well': es.well,
                'num_reads': es.num_reads})
            es_obj.save()

    parse_alignment_group_vcf(alignment_group,
            Dataset.TYPE.VCF_FREEBAYES_SNPEFF)
Example #19
def compute_callable_loci(reference_genome,
                          sample_alignment,
                          bam_file_location,
                          stderr=None):

    # Set output fn to None in case try fails.
    callable_loci_bed_fn = None

    try:
        ref_genome_fasta_location = get_dataset_with_type(
            reference_genome,
            Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

        callable_loci_bed_fn = (
            _get_callable_loci_output_filename(bam_file_location))

        get_callable_loci(bam_file_location, callable_loci_bed_fn)

        # Add callable loci bed as dataset
        callable_loci_bed_dataset = Dataset.objects.create(
            label=Dataset.TYPE.BED_CALLABLE_LOCI,
            type=Dataset.TYPE.BED_CALLABLE_LOCI,
            filesystem_location=clean_filesystem_location(
                callable_loci_bed_fn))

        sample_alignment.dataset_set.add(callable_loci_bed_dataset)
        sample_alignment.save()

        clean_bed_fn = clean_bed_features(callable_loci_bed_dataset,
                                          stderr=stderr)

        # add it as a jbrowse track
        add_bed_file_track(reference_genome, sample_alignment,
                           callable_loci_bed_dataset)

    except Exception as e:
        print >> stderr, 'WARNING: Callable Loci failed.'
        print >> stderr, str(e)
        clean_bed_fn = ''

    finally:
        return clean_bed_fn
Example #20
def compute_callable_loci(reference_genome, sample_alignment,
            bam_file_location, stderr=None):

    # Set output fn to None in case try fails.
    callable_loci_bed_fn = None

    try:
        ref_genome_fasta_location = get_dataset_with_type(
                reference_genome,
                Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

        callable_loci_bed_fn = (
                _get_callable_loci_output_filename(bam_file_location))

        get_callable_loci(bam_file_location, callable_loci_bed_fn)

        # Add callable loci bed as dataset
        callable_loci_bed_dataset = Dataset.objects.create(
                label=Dataset.TYPE.BED_CALLABLE_LOCI,
                type=Dataset.TYPE.BED_CALLABLE_LOCI,
                filesystem_location=clean_filesystem_location(callable_loci_bed_fn))

        sample_alignment.dataset_set.add(callable_loci_bed_dataset)
        sample_alignment.save()

        clean_bed_fn = clean_bed_features(callable_loci_bed_dataset, stderr=stderr)

        # add it as a jbrowse track
        add_bed_file_track(
                reference_genome,
                sample_alignment,
                callable_loci_bed_dataset)


    except Exception as e:
        print >> stderr, 'WARNING: Callable Loci failed.'
        print >> stderr, str(e)
        clean_bed_fn = ''

    finally:
        return clean_bed_fn
Example #21
    def test_multiple_chromosome_dataset_import(self):
        user = User.objects.create_user(TEST_USERNAME,
                                        password=TEST_PASSWORD,
                                        email=TEST_EMAIL)

        project = Project.objects.create(title=TEST_PROJECT_NAME,
                                         owner=user.get_profile())

        test_yeast_genome = ReferenceGenome.objects.create(
            project=project, label='superbrewer2000')

        test_dataset_path = os.path.join(settings.PWD,
                                         'test_data/yeast_chrom_jkl.fasta')
        dataset_path = copy_dataset_to_entity_data_dir(test_yeast_genome,
                                                       test_dataset_path)

        test_chroms_dataset = Dataset.objects.create(
            label='jkl_chroms',
            type=Dataset.TYPE.REFERENCE_GENOME_FASTA,
            filesystem_location=clean_filesystem_location(dataset_path))

        test_yeast_genome.dataset_set.add(test_chroms_dataset)

        # Assert correct number of chromosomes
        assert (test_yeast_genome.num_chromosomes == 3)

        # Assert correct number of bases
        assert (test_yeast_genome.num_bases == sum([
            chrom.num_bases for chrom in Chromosome.objects.filter(
                reference_genome=test_yeast_genome)
        ]))

        # Assert correct chromosome labels
        expected_chrom_names = [
            'gi|448092123|ref', 'gi|448096713|ref', 'gi|448100869|ref'
        ]

        assert (set([
            chrom.seqrecord_id for chrom in Chromosome.objects.filter(
                reference_genome=test_yeast_genome)
        ]) == set(expected_chrom_names))
Example #22
def sanitize_sequence_dataset(dataset):
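    """Truncate over-long record ids in a FASTA/GenBank reference Dataset.

    Record ids longer than 16 characters are shortened and the Dataset is
    re-pointed at a '.clean' copy of the file. The 16-character cutoff most
    likely reflects the GenBank LOCUS name length limit enforced when writing
    with Biopython (an assumption; the original code does not say).
    """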

    dataset_type_to_parse_format = {
        Dataset.TYPE.REFERENCE_GENOME_FASTA: 'fasta',
        Dataset.TYPE.REFERENCE_GENOME_GENBANK: 'genbank'
    }

    if dataset.type not in dataset_type_to_parse_format:
        return

    dirty_file_path = dataset.get_absolute_location()
    parse_format = dataset_type_to_parse_format[dataset.type]

    needs_sanitizing = False
    with open(dirty_file_path, 'r') as dirty_fh:
        for seq_record in SeqIO.parse(dirty_fh, parse_format):
            if len(seq_record.id) > 16:
                needs_sanitizing = True
                break

    if not needs_sanitizing:
        return

    prefix, ext = os.path.splitext(dirty_file_path)
    clean_file_path = prefix + '.clean' + ext

    seq_record_list = []
    with open(dirty_file_path, 'r') as dirty_fh:
        for seq_record in SeqIO.parse(dirty_fh, parse_format):
            seq_record.id = seq_record.id[:16]
            seq_record.name = seq_record.id
            seq_record_list.append(seq_record)


    with open(clean_file_path, 'w') as clean_fh:
        SeqIO.write(seq_record_list, clean_fh, parse_format)

    dataset.filesystem_location = clean_filesystem_location(clean_file_path)
    dataset.save()
Example #24
    def test_run_lumpy(self):
        TEST_SAMPLE_UID = '8c57e7b9'

        # Create a ref genome.
        self.reference_genome = import_reference_genome_from_local_file(
                self.project, 'ref_genome', TEST_FASTA, 'fasta')

        # Create a sample.
        self.experiment_sample = ExperimentSample.objects.create(
                uid=TEST_SAMPLE_UID, project=self.project, label='sample1')

        # Create a new alignment group.
        alignment_group = AlignmentGroup.objects.create(
                label='test alignment', reference_genome=self.reference_genome)

        self.alignment_group = alignment_group

        # Create the expected models.
        sample_alignment = ExperimentSampleToAlignment.objects.create(
                alignment_group=alignment_group,
                experiment_sample=self.experiment_sample)
        bwa_dataset = Dataset.objects.create(
                label=Dataset.TYPE.BWA_ALIGN,
                type=Dataset.TYPE.BWA_ALIGN,
                status=Dataset.STATUS.READY)
        bwa_dataset.filesystem_location = clean_filesystem_location(
                TEST_DISC_SPLIT_BAM)
        bwa_dataset.save()

        sample_alignment.dataset_set.add(bwa_dataset)
        sample_alignment.save()

        self.bwa_dataset = bwa_dataset
        self.sample_alignment = sample_alignment

        fasta_ref = get_dataset_with_type(
            self.reference_genome,
            Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

        sample_alignments = [self.sample_alignment]

        vcf_output_dir = self.alignment_group.get_model_data_dir()

        vcf_output_filename = os.path.join(vcf_output_dir, 'lumpy.vcf')

        alignment_type = 'BWA_ALIGN'

        # NOTE: Running these functions but not checking results.
        get_discordant_read_pairs(self.sample_alignment)
        get_split_reads(self.sample_alignment)

        run_lumpy(fasta_ref, sample_alignments, vcf_output_dir,
                vcf_output_filename, alignment_type)

        dataset = Dataset.objects.create(
                type=Dataset.TYPE.VCF_LUMPY,
                label=Dataset.TYPE.VCF_LUMPY,
                filesystem_location=vcf_output_filename,
        )

        self.alignment_group.dataset_set.add(dataset)

        # Parse the resulting vcf, grab variant objects
        parse_alignment_group_vcf(self.alignment_group, Dataset.TYPE.VCF_LUMPY)

        # Grab the resulting variants.
        variants = Variant.objects.filter(reference_genome=self.reference_genome)

        # There should be a Variant object for each sv event.
        self.assertEqual(2, len(variants))

        # One event should be located very close to 25k
        va_positions = [v.position for v in variants]
        va_offset = [25000 - va_pos for va_pos in va_positions]
        self.assertTrue(any([v < 50 for v in va_offset]))
Example #25
    def derivation_fn(sample_alignment, new_dataset):
        """Creates a bam file with reads relevant for de novo assembly.
        """
        # Get the original bam file.
        bam_dataset = get_dataset_with_type(sample_alignment,
                                            Dataset.TYPE.BWA_ALIGN)
        orig_bam_filename = bam_dataset.get_absolute_location()

        # Allocate a filename for the new dataset.
        de_novo_bam_filelocation = (os.path.splitext(orig_bam_filename)[0] +
                                    '.de_novo.bam')
        new_dataset.filesystem_location = clean_filesystem_location(
            de_novo_bam_filelocation)
        new_dataset.save(update_fields=['filesystem_location'])

        ### Strategy
        # 0. Create intermediate sam
        # 1. Grab unmapped reads.
        #     a. If the corresponding pair for any read was mapped (and thus not
        #        in the unmapped dataset), grab that from the original bam file.
        # 2. Grab split reads.
        # 2b. Add reads that are in intervals of interest.
        # 3. Sort by name so that paired reads are next to each other
        # 4. Filter out duplicate reads. Requires sort in step 3.
        # 5. Convert to bam.
        # 6. Delete intermediate files.

        # 0. Create intermediate files.
        intermediate_sam = (os.path.splitext(de_novo_bam_filelocation)[0] +
                            '.int.sam')
        paired_intermediate_sam = (os.path.splitext(intermediate_sam)[0] +
                                   '.paired.sam')
        sorted_intermediate_sam = (
            os.path.splitext(paired_intermediate_sam)[0] + '.sorted.sam')
        deduped_sorted_intermediate_sam = (
            os.path.splitext(sorted_intermediate_sam)[0] + '.deduped.sam')
        intermediate_files = [
            intermediate_sam, paired_intermediate_sam, sorted_intermediate_sam,
            deduped_sorted_intermediate_sam
        ]

        # 1. Get unmapped reads.
        unmapped_reads_dataset = get_unmapped_reads(sample_alignment)
        unmapped_reads_bam_file = unmapped_reads_dataset.get_absolute_location()
        cmd = '{samtools} view -h {bam_filename}'.format(
            samtools=settings.SAMTOOLS_BINARY,
            bam_filename=unmapped_reads_bam_file)
        with open(intermediate_sam, 'w') as output_fh:
            subprocess.check_call(cmd, stdout=output_fh, shell=True)

        # 2. Grab split reads.
        split_reads_dataset = get_split_reads(sample_alignment)
        split_reads_bam_file = split_reads_dataset.get_absolute_location()
        cmd = '{samtools} view {bam_filename}'.format(
            samtools=settings.SAMTOOLS_BINARY,
            bam_filename=split_reads_bam_file)
        with open(intermediate_sam, 'a') as output_fh:
            subprocess.check_call(cmd, stdout=output_fh, shell=True)

        # 2b. Add reads that are in any of intervals that we want to include.
        if force_include_reads_in_intervals:
            with open(intermediate_sam, 'a') as output_fh:
                get_reads_in_interval_list(orig_bam_filename,
                                           force_include_reads_in_intervals,
                                           output_fh)

        # 2c. For each of the reads that we've included, grab their
        # corresponding pairs.
        add_paired_mates(intermediate_sam, orig_bam_filename,
                         paired_intermediate_sam)

        # 3. Sort by name so that paired reads are next to each other
        cmd = ('(grep ^"@" {sam_file}; grep -v ^"@" {sam_file} | '
               'sort -k1,1 -k2,2n) > {sorted_sam_file}').format(
                   sam_file=paired_intermediate_sam,
                   sorted_sam_file=sorted_intermediate_sam)
        subprocess.call(cmd, shell=True)

        # 4. Filter out duplicate reads. Requires sort in step 3.
        cmd = 'uniq {sorted_sam_file} > {deduped_sam_file}'.format(
            sorted_sam_file=sorted_intermediate_sam,
            deduped_sam_file=deduped_sorted_intermediate_sam)
        subprocess.call(cmd, shell=True)

        # 5. Convert to bam.
        cmd = '{samtools} view -Sb {sam_file} > {final_bam_file}'.format(
            samtools=settings.SAMTOOLS_BINARY,
            sam_file=deduped_sorted_intermediate_sam,
            final_bam_file=de_novo_bam_filelocation)
        subprocess.call(cmd, shell=True)

        # 6. Delete intermediate files.
        for f in intermediate_files:
            os.remove(f)
Example #26
    def test_run_lumpy(self):
        TEST_SAMPLE_UID = '8c57e7b9'

        # Create a ref genome.
        self.reference_genome = import_reference_genome_from_local_file(
            self.project, 'ref_genome', TEST_FASTA, 'fasta')

        # Create a sample.
        self.experiment_sample = ExperimentSample.objects.create(
            uid=TEST_SAMPLE_UID, project=self.project, label='sample1')

        # Create a new alignment group.
        alignment_group = AlignmentGroup.objects.create(
            label='test alignment', reference_genome=self.reference_genome)

        self.alignment_group = alignment_group

        # Create the expected models.
        sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=alignment_group,
            experiment_sample=self.experiment_sample)
        bwa_dataset = Dataset.objects.create(label=Dataset.TYPE.BWA_ALIGN,
                                             type=Dataset.TYPE.BWA_ALIGN,
                                             status=Dataset.STATUS.READY)
        bwa_dataset.filesystem_location = clean_filesystem_location(
            TEST_DISC_SPLIT_BAM)
        bwa_dataset.save()

        sample_alignment.dataset_set.add(bwa_dataset)
        sample_alignment.save()

        self.bwa_dataset = bwa_dataset
        self.sample_alignment = sample_alignment

        fasta_ref = get_dataset_with_type(
            self.reference_genome,
            Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

        sample_alignments = [self.sample_alignment]

        vcf_output_dir = self.alignment_group.get_model_data_dir()

        vcf_output_filename = os.path.join(vcf_output_dir, 'lumpy.vcf')

        alignment_type = 'BWA_ALIGN'

        # NOTE: Running these functions but not checking results.
        get_discordant_read_pairs(self.sample_alignment)
        get_split_reads(self.sample_alignment)

        run_lumpy(fasta_ref, sample_alignments, vcf_output_dir,
                  vcf_output_filename, alignment_type)

        dataset = Dataset.objects.create(
            type=Dataset.TYPE.VCF_LUMPY,
            label=Dataset.TYPE.VCF_LUMPY,
            filesystem_location=vcf_output_filename,
        )

        self.alignment_group.dataset_set.add(dataset)

        # Parse the resulting vcf, grab variant objects
        parse_alignment_group_vcf(self.alignment_group, Dataset.TYPE.VCF_LUMPY)

        # Grab the resulting variants.
        variants = Variant.objects.filter(
            reference_genome=self.reference_genome)

        # There should be a Variant object for each sv event.
        self.assertEqual(2, len(variants))

        # One event should be located very close to 25k
        va_positions = [v.position for v in variants]
        va_offset = [25000 - va_pos for va_pos in va_positions]
        self.assertTrue(any([v < 50 for v in va_offset]))
Example #27
def get_split_reads(sample_alignment):
    """Isolate split reads from a sample alignment.

    This uses a python script supplied with Lumpy that is run as a
    separate process.

    NOTE THAT THIS SCRIPT ONLY WORKS WITH BWA MEM.
    """
    bwa_split_dataset = get_dataset_with_type(
            sample_alignment, Dataset.TYPE.BWA_SPLIT)
    if bwa_split_dataset is not None:
        if (bwa_split_dataset.status == Dataset.STATUS.READY and
                os.path.exists(bwa_split_dataset.get_absolute_location())):
            return bwa_split_dataset
    else:
        bwa_split_dataset = Dataset.objects.create(
                label=Dataset.TYPE.BWA_SPLIT,
                type=Dataset.TYPE.BWA_SPLIT,
                status=Dataset.STATUS.NOT_STARTED)
        sample_alignment.dataset_set.add(bwa_split_dataset)

    # If here, we are going to run or re-run the Dataset.
    bwa_split_dataset.status = Dataset.STATUS.NOT_STARTED
    bwa_split_dataset.save(update_fields=['status'])

    bam_dataset = get_dataset_with_type(sample_alignment, Dataset.TYPE.BWA_ALIGN)
    bam_filename = bam_dataset.get_absolute_location()

    assert os.path.exists(bam_filename), "BAM file '%s' is missing." % (
            bam_filename)

    # NOTE: This assumes the index just adds at .bai, w/ same path otherwise
    # - will this always be true?
    if not os.path.exists(bam_filename+'.bai'):
        index_bam_file(bam_filename)

    bam_split_filename = os.path.join(sample_alignment.get_model_data_dir(),
            'bwa_split_reads.bam')

    # Use lumpy bwa-mem split read script to pull out split reads.
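    # The pipe below streams the BAM as SAM (keeping the header), runs it
    # through Lumpy's bwa-mem split-read extraction script, and converts the
    # result straight back to BAM, so no intermediate SAM file touches disk.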
    filter_split_reads = ' | '.join([
            '{samtools} view -h {bam_filename}',
            'python {lumpy_bwa_mem_sr_script} -i stdin',
            '{samtools} view -Sb -']).format(
                    samtools=settings.SAMTOOLS_BINARY,
                    bam_filename=bam_filename,
                    lumpy_bwa_mem_sr_script=
                            settings.LUMPY_EXTRACT_SPLIT_READS_BWA_MEM)

    try:
        bwa_split_dataset.status = Dataset.STATUS.COMPUTING
        bwa_split_dataset.save(update_fields=['status'])

        with open(bam_split_filename, 'w') as fh:
            subprocess.check_call(filter_split_reads,
                    stdout=fh,
                    shell=True,
                    executable=settings.BASH_PATH)

        # sort the split reads, overwrite the old file
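        # (Legacy pre-1.0 'samtools sort <in.bam> <out.prefix>' calling
        # convention: samtools appends '.bam' to the prefix, which is why
        # splitext() is used and the original file ends up overwritten.)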
        subprocess.check_call([settings.SAMTOOLS_BINARY, 'sort', bam_split_filename,
                os.path.splitext(bam_split_filename)[0]])

        _filter_out_interchromosome_reads(bam_split_filename)

        bwa_split_dataset.status = Dataset.STATUS.READY
        bwa_split_dataset.filesystem_location = clean_filesystem_location(
                bam_split_filename)

    except subprocess.CalledProcessError:
        # if there are no split reads, then fail.
        bwa_split_dataset.filesystem_location = ''
        bwa_split_dataset.status = Dataset.STATUS.FAILED

    bwa_split_dataset.save()

    return bwa_split_dataset
Example #28
def get_discordant_read_pairs(sample_alignment):
    """Isolate discordant pairs of reads from a sample alignment.
    """
    # First, check if completed dataset already exists.
    bwa_disc_dataset = get_dataset_with_type(
            sample_alignment, Dataset.TYPE.BWA_DISCORDANT)
    if bwa_disc_dataset is not None:
        if (bwa_disc_dataset.status == Dataset.STATUS.READY and
                os.path.exists(bwa_disc_dataset.get_absolute_location())):
            return bwa_disc_dataset
    else:
        bwa_disc_dataset = Dataset.objects.create(
                label=Dataset.TYPE.BWA_DISCORDANT,
                type=Dataset.TYPE.BWA_DISCORDANT)
        sample_alignment.dataset_set.add(bwa_disc_dataset)

    # If here, we are going to run or re-run the Dataset.
    bwa_disc_dataset.status = Dataset.STATUS.NOT_STARTED
    bwa_disc_dataset.save(update_fields=['status'])

    bam_dataset = get_dataset_with_type(sample_alignment, Dataset.TYPE.BWA_ALIGN)
    bam_filename = bam_dataset.get_absolute_location()

    assert os.path.exists(bam_filename), "BAM file '%s' is missing." % (
            bam_filename)

    # NOTE: This assumes the index just adds at .bai, w/ same path otherwise
    # - will this always be true?
    if not os.path.exists(bam_filename+'.bai'):
        index_bam_file(bam_filename)

    bam_discordant_filename = os.path.join(sample_alignment.get_model_data_dir(),
            'bwa_discordant_pairs.bam')


    try:
        bwa_disc_dataset.status = Dataset.STATUS.COMPUTING
        bwa_disc_dataset.save(update_fields=['status'])

        # Use bam read alignment flags to pull out discordant pairs only
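        # Each -F mask drops reads with that flag set: 0x0002 read mapped in
        # proper pair, 0x0100 secondary alignment, 0x0004 read unmapped,
        # 0x0008 mate unmapped, 0x0400 PCR/optical duplicate. What survives
        # are mapped, primary, non-duplicate reads whose pair is not properly
        # mapped, i.e. discordant read pairs.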
        filter_discordant = ' | '.join([
                '{samtools} view -u -F 0x0002 {bam_filename} ',
                '{samtools} view -u -F 0x0100 - ',
                '{samtools} view -u -F 0x0004 - ',
                '{samtools} view -u -F 0x0008 - ',
                '{samtools} view -b -F 0x0400 - ']).format(
                        samtools=settings.SAMTOOLS_BINARY,
                        bam_filename=bam_filename)

        with open(bam_discordant_filename, 'w') as fh:
            subprocess.check_call(filter_discordant,
                    stdout=fh, shell=True, executable=settings.BASH_PATH)

        # sort the discordant reads, overwrite the old file
        subprocess.check_call([settings.SAMTOOLS_BINARY, 'sort', bam_discordant_filename,
                os.path.splitext(bam_discordant_filename)[0]])

        _filter_out_interchromosome_reads(bam_discordant_filename)

        bwa_disc_dataset.filesystem_location = clean_filesystem_location(
                bam_discordant_filename)
        bwa_disc_dataset.status = Dataset.STATUS.READY

    except subprocess.CalledProcessError:
        bwa_disc_dataset.filesystem_location = ''
        bwa_disc_dataset.status = Dataset.STATUS.FAILED

    bwa_disc_dataset.save()

    return bwa_disc_dataset
Example #29
def align_with_bwa_mem(alignment_group, sample_alignment):
    """
    REPLACES OLD BWA PIPELINE USING ALN AND SAMPE/SAMSE
    Aligns a sample to a reference genome using the bwa tool.

    Args:
        alignment_group: AlignmentGroup that this alignment is part of.
        sample_alignment: ExperimentSampleToAlignment. The respective dataset
            is assumed to have been created as well.
    """

    # Start by getting fresh objects from database.
    sample_alignment = ExperimentSampleToAlignment.objects.get(
            id=sample_alignment.id)
    experiment_sample = sample_alignment.experiment_sample
    alignment_group = AlignmentGroup.objects.get(id=alignment_group.id)


    # Grab the reference genome fasta for the alignment.
    ref_genome_fasta = get_dataset_with_type(
            alignment_group.reference_genome,
            Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

    # Get the BWA Dataset and set it to computing.
    bwa_dataset = sample_alignment.dataset_set.get(
                type=Dataset.TYPE.BWA_ALIGN)
    bwa_dataset.status = Dataset.STATUS.COMPUTING
    bwa_dataset.save(update_fields=['status'])

    # Create a file that we'll write stderr to.
    error_path = os.path.join(sample_alignment.get_model_data_dir(),
            'bwa_align.error')
    error_output = open(error_path, 'w')

    # The alignment group is now officially ALIGNING.
    if alignment_group.status != AlignmentGroup.STATUS.ALIGNING:
        alignment_group.status = AlignmentGroup.STATUS.ALIGNING
        alignment_group.start_time = datetime.now()
        alignment_group.end_time = None
        alignment_group.save(update_fields=['status', 'start_time', 'end_time'])

    error_output.write(
            "==START OF ALIGNMENT PIPELINE FOR %s, (%s) ==\n" % (
                    sample_alignment.experiment_sample.label,
                    sample_alignment.uid))

    # We wrap the alignment logic in a try-except so that if an error occurs,
    # we record it and update the status of the Dataset to FAILED if anything
    # should fail.
    try:
        # Build index if the index doesn't exist.
        # NOTE: When aligning multiple samples to the same reference genome
        # concurrently, the build index method should be called once to completion
        # before starting the concurrent alignment jobs.
        ensure_bwa_index(ref_genome_fasta)

        # Grab the fastq sources, and determine whether we are doing paired ends.
        # First, grab fastq1, which must exist
        fq1_queryset = experiment_sample.dataset_set.filter(
                type=Dataset.TYPE.FASTQ1)
        assert fq1_queryset, "Must have at least one .fastq file"
        fq1_dataset = fq1_queryset[0]
        input_reads_1_fq = fq1_dataset.wrap_if_compressed()
        input_reads_1_fq_path = fq1_dataset.get_absolute_location()

        # Second, check if fastq2 exists and set is_paired_end
        fq2_queryset = experiment_sample.dataset_set.filter(
                type=Dataset.TYPE.FASTQ2)
        if fq2_queryset:
            is_paired_end = True
            fq2_dataset = fq2_queryset[0]
            input_reads_2_fq = fq2_dataset.wrap_if_compressed()
            input_reads_2_fq_path = fq2_dataset.get_absolute_location()
        else:
            is_paired_end = False

        # 1. Build the bwa mem invocation.
        read_fq_1_path, read_fq_1_fn = os.path.split(input_reads_1_fq_path)

        align_input_args = ' '.join([
            '%s/bwa/bwa' % settings.TOOLS_DIR,
            'mem',
            '-t', '1', # threads
            '-R', '"'+read_group_string(experiment_sample)+'"',
            # uncomment this to keep secondary alignments (for finding and marking paralogy regions)
            # But before we can uncomment we need to fix de novo assembly code
            '-a',
            ref_genome_fasta,
            input_reads_1_fq,
        ])

        if is_paired_end:
            read_fq_2_path, read_fq_2_fn = os.path.split(input_reads_2_fq_path)
            align_input_args += ' ' + input_reads_2_fq

        # To skip saving the SAM file to disk directly, pipe output directly to
        # make a BAM file.
        align_input_args += ' | ' + settings.SAMTOOLS_BINARY + ' view -bS -'

        ### 2. Run the alignment; SAM output is piped straight into a BAM file.
        output_bam = os.path.join(sample_alignment.get_model_data_dir(),
                'bwa_align.bam')

        error_output.write(align_input_args)

        # Flush the output here so it gets written before the alignments.
        error_output.flush()

        with open(output_bam, 'w') as fh:
            subprocess.check_call(align_input_args,
                    stdout=fh, stderr=error_output,
                    shell=True, executable=settings.BASH_PATH)

        # Set processing mask to not compute insert metrics if reads are
        # not paired end, as the lumpy script only works on paired end reads
        opt_processing_mask = {}
        if not is_paired_end:
            opt_processing_mask['compute_insert_metrics'] = False

        # Do several layers of processing on top of the initial alignment.
        result_bam_file = process_sam_bam_file(sample_alignment,
                alignment_group.reference_genome, output_bam, error_output,
                opt_processing_mask=opt_processing_mask)

        # Add the resulting file to the dataset.
        bwa_dataset.filesystem_location = clean_filesystem_location(
                result_bam_file)
        bwa_dataset.save()

        # Isolate split and discordant reads for SV calling.
        get_discordant_read_pairs(sample_alignment)
        get_split_reads(sample_alignment)

        # Add track to JBrowse.
        add_bam_file_track(alignment_group.reference_genome, sample_alignment,
                Dataset.TYPE.BWA_ALIGN)

        bwa_dataset.status = Dataset.STATUS.READY
        bwa_dataset.save()

        delete_redundant_files(sample_alignment.get_model_data_dir())

    except:
        import traceback
        error_output.write(traceback.format_exc())
        bwa_dataset.status = Dataset.STATUS.FAILED
        bwa_dataset.save()
        return
    finally:
        print error_path
        error_output.write('==END OF ALIGNMENT PIPELINE==\n')
        error_output.close()

        # Add the error Dataset to the object.
        error_dataset = Dataset.objects.create(
                label=Dataset.TYPE.BWA_ALIGN_ERROR,
                type=Dataset.TYPE.BWA_ALIGN_ERROR,
                filesystem_location=clean_filesystem_location(error_path))
        sample_alignment.dataset_set.add(error_dataset)
        sample_alignment.save()

        return sample_alignment
def generate_new_reference_genome(variant_set, new_ref_genome_params):
    """Uses reference_genome_maker code to create a new ReferenceGenome
    from the given VariantSet (applies Variants to existing ReferenceGenome.)

    Args:
        variant_set: The VariantSet from which we'll generate the new
            ReferenceGenome.
        new_ref_genome_params: Dictionary of params, including:
            * label (required): Label for the new ReferenceGenome.

    Returns:
        The new ReferenceGenome.

    Raises:
        ValidationException if we don't support this use case.
    """
    try:
        # Validate / parse params.
        assert 'label' in new_ref_genome_params
        new_ref_genome_label = new_ref_genome_params['label']

        original_ref_genome = variant_set.reference_genome

        # Create a ReferenceGenome to track the position.
        reference_genome = ReferenceGenome.objects.create(
            project=original_ref_genome.project, label=new_ref_genome_label)

        # Location for the generated Genbank.
        filename_prefix = generate_safe_filename_prefix_from_label(
            new_ref_genome_label)
        output_root = os.path.join(reference_genome.get_model_data_dir(),
                                   filename_prefix)
        full_output_path = clean_filesystem_location(output_root + '.genbank')

        # Dataset to track the location.
        dataset = Dataset.objects.create(
            label=Dataset.TYPE.REFERENCE_GENOME_GENBANK,
            type=Dataset.TYPE.REFERENCE_GENOME_GENBANK,
            filesystem_location=full_output_path,
            status=Dataset.STATUS.NOT_STARTED)
        reference_genome.dataset_set.add(dataset)

        # Prepare params for calling reference_genome_maker.
        # If the old genome is annotated, use it, otherwise, use the FASTA.
        # The BioPython SeqRecord should be the same either way.
        if original_ref_genome.is_annotated():
            original_genome_path = original_ref_genome.dataset_set.get(
                    type=Dataset.TYPE.REFERENCE_GENOME_GENBANK).\
                        get_absolute_location()
            sequence_record = SeqIO.read(original_genome_path, 'genbank')
        else:
            original_genome_path = original_ref_genome.dataset_set.get(
                    type=Dataset.TYPE.REFERENCE_GENOME_FASTA).\
                        get_absolute_location()
            sequence_record = SeqIO.read(original_genome_path, 'fasta')

        filename_prefix = generate_safe_filename_prefix_from_label(
            new_ref_genome_label)
        output_root = os.path.join(reference_genome.get_model_data_dir(),
                                   filename_prefix)

        # Create a fake, empty vcf path for now, as we're just getting
        # end-to-end to work.
        if not os.path.exists(settings.TEMP_FILE_ROOT):
            os.mkdir(settings.TEMP_FILE_ROOT)
        _, vcf_path = tempfile.mkstemp(suffix='_' + filename_prefix + '.vcf',
                                       dir=settings.TEMP_FILE_ROOT)

        with open(vcf_path, 'w') as vcf_fh:
            export_variant_set_as_vcf(variant_set, vcf_fh)

        dataset.status = Dataset.STATUS.COMPUTING
        dataset.save(update_fields=['status'])

        try:
            new_ref_genome_seq_record = reference_genome_maker.run(
                sequence_record, output_root, vcf_path)
        except Exception as e:
            dataset.status = Dataset.STATUS.FAILED
            dataset.save(update_fields=['status'])
            raise e

        reference_genome.save()
        dataset.status = Dataset.STATUS.READY
        dataset.save(update_fields=['status'])

        # Since the post_add_seq_to_ref_genome() signal couldn't run before,
        # we need to make sure to run it now.
        prepare_ref_genome_related_datasets(reference_genome, dataset)

        return reference_genome

    except Exception as e:
        raise ValidationException(str(e))
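
# The NOT_STARTED -> COMPUTING -> READY/FAILED bookkeeping around
# reference_genome_maker.run() above recurs in this module. A minimal sketch of
# how it could be factored out; dataset_status_context() is hypothetical, and
# only the Dataset model and its STATUS values come from the code above.
from contextlib import contextmanager


@contextmanager
def dataset_status_context(dataset):
    """Hypothetical helper: mark `dataset` COMPUTING while the block runs,
    READY on success, FAILED if the block raises.
    """
    dataset.status = Dataset.STATUS.COMPUTING
    dataset.save(update_fields=['status'])
    try:
        yield dataset
    except Exception:
        dataset.status = Dataset.STATUS.FAILED
        dataset.save(update_fields=['status'])
        raise
    else:
        dataset.status = Dataset.STATUS.READY
        dataset.save(update_fields=['status'])

# Usage sketch:
#     with dataset_status_context(dataset):
#         reference_genome_maker.run(sequence_record, output_root, vcf_path)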


def generate_new_reference_genome(variant_set, new_ref_genome_params):
    """Uses reference_genome_maker code to create a new ReferenceGenome
    from the given VariantSet (applies Variants to existing ReferenceGenome.)

    Args:
        variant_set: The VariantSet from which we'll generate the new
            ReferenceGenome.
        new_ref_genome_params: Dictionary of params, including:
            * label (required): Label for the new ReferenceGenome.

    Returns:
        The new ReferenceGenome.

    Raises:
        ValidationException if we don't support this use case.
    """
    try:
        # Validate / parse params.
        assert 'label' in new_ref_genome_params
        new_ref_genome_label = new_ref_genome_params['label']

        original_ref_genome = variant_set.reference_genome

        # Create a ReferenceGenome to track the position.
        reference_genome = ReferenceGenome.objects.create(
                project=original_ref_genome.project,
                label=new_ref_genome_label)

        # Location for the generated Genbank.
        filename_prefix = generate_safe_filename_prefix_from_label(
                new_ref_genome_label)
        output_root = os.path.join(reference_genome.get_model_data_dir(),
                filename_prefix)
        full_output_path = clean_filesystem_location(output_root + '.genbank')

        # Dataset to track the location.
        dataset = Dataset.objects.create(
                label=Dataset.TYPE.REFERENCE_GENOME_GENBANK,
                type=Dataset.TYPE.REFERENCE_GENOME_GENBANK,
                filesystem_location=full_output_path,
                status=Dataset.STATUS.NOT_STARTED)
        reference_genome.dataset_set.add(dataset)

        # Prepare params for calling reference_genome_maker.
        original_fasta_path = original_ref_genome.dataset_set.get(
                type=Dataset.TYPE.REFERENCE_GENOME_FASTA).\
                        get_absolute_location()
        sequence_record = SeqIO.read(original_fasta_path, 'fasta')

        filename_prefix = generate_safe_filename_prefix_from_label(
                new_ref_genome_label)
        output_root = os.path.join(reference_genome.get_model_data_dir(),
                filename_prefix)

        # Create a temporary vcf path; it is filled below by exporting the
        # VariantSet as a vcf.
        if not os.path.exists(settings.TEMP_FILE_ROOT):
            os.mkdir(settings.TEMP_FILE_ROOT)
        _, vcf_path = tempfile.mkstemp(
                suffix='_' + filename_prefix + '.vcf',
                dir=settings.TEMP_FILE_ROOT)

        with open(vcf_path, 'w') as vcf_fh:
            export_variant_set_as_vcf(variant_set, vcf_fh)

        dataset.status = Dataset.STATUS.COMPUTING
        dataset.save(update_fields=['status'])

        try:
            new_ref_genome_seq_record = reference_genome_maker.run(
                    sequence_record, output_root, vcf_path)
        except Exception as e:
            dataset.status = Dataset.STATUS.FAILED
            dataset.save(update_fields=['status'])
            raise e

        reference_genome.save()
        dataset.status = Dataset.STATUS.READY
        dataset.save(update_fields=['status'])

        # Since the post_add_seq_to_ref_genome() signal couldn't run before,
        # we need to make sure to run it now.
        prepare_ref_genome_related_datasets(reference_genome, dataset)

        return reference_genome

    except Exception as e:
        raise ValidationException(str(e))
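
# A minimal usage sketch for generate_new_reference_genome(), following its
# docstring. The caller below is hypothetical; only the required 'label' key in
# the params dict comes from the function above.
def _example_generate_new_reference_genome(variant_set):
    """Hypothetical caller: applies the Variants in `variant_set` to its
    ReferenceGenome and returns the newly created ReferenceGenome.
    """
    new_ref_genome_params = {'label': 'strain_with_variants_applied'}
    return generate_new_reference_genome(variant_set, new_ref_genome_params)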
Example #32
    def derivation_fn(sample_alignment, new_dataset):
        """Creates a bam file with reads relevant for de novo assembly.
        """
        # Get the original bam file.
        bam_dataset = get_dataset_with_type(sample_alignment,
                Dataset.TYPE.BWA_ALIGN)
        orig_bam_filename = bam_dataset.get_absolute_location()

        # Allocate a filename for the new dataset.
        de_novo_bam_filelocation = (os.path.splitext(orig_bam_filename)[0] +
                '.de_novo.bam')
        new_dataset.filesystem_location = clean_filesystem_location(
                de_novo_bam_filelocation)
        new_dataset.save(update_fields=['filesystem_location'])

        ### Strategy
        # 0. Create intermediate sam files.
        # 1. Grab unmapped reads.
        # 2. Grab split reads.
        # 2b. Add reads that are in intervals of interest.
        # 2c. For each read included so far, grab its mate from the original
        #     bam file (e.g. the mapped mate of an unmapped read, which is not
        #     in the unmapped dataset).
        # 3. Sort by name so that paired reads are next to each other.
        # 4. Filter out duplicate reads. Requires the sort in step 3.
        # 5. Convert to bam.
        # 6. Delete intermediate files.

        # 0. Create intermediate files.
        intermediate_sam = (
                os.path.splitext(de_novo_bam_filelocation)[0] + '.int.sam')
        paired_intermediate_sam = (
                os.path.splitext(intermediate_sam)[0] + '.paired.sam')
        sorted_intermediate_sam = (
                os.path.splitext(paired_intermediate_sam)[0] + '.sorted.sam')
        deduped_sorted_intermediate_sam = (
                os.path.splitext(sorted_intermediate_sam)[0] + '.deduped.sam')
        intermediate_files = [
                intermediate_sam,
                paired_intermediate_sam,
                sorted_intermediate_sam,
                deduped_sorted_intermediate_sam
        ]

        # 1. Get unmapped reads.
        unmapped_reads_dataset = get_unmapped_reads(sample_alignment)
        unmapped_reads_bam_file = unmapped_reads_dataset.get_absolute_location()
        cmd = '{samtools} view -h {bam_filename}'.format(
                samtools=settings.SAMTOOLS_BINARY,
                bam_filename=unmapped_reads_bam_file)
        with open(intermediate_sam, 'w') as output_fh:
            subprocess.check_call(cmd, stdout=output_fh, shell=True)

        # 2. Grab split reads.
        split_reads_dataset = get_split_reads(sample_alignment)
        split_reads_bam_file = split_reads_dataset.get_absolute_location()
        cmd = '{samtools} view {bam_filename}'.format(
                samtools=settings.SAMTOOLS_BINARY,
                bam_filename=split_reads_bam_file)
        with open(intermediate_sam, 'a') as output_fh:
            subprocess.check_call(cmd, stdout=output_fh, shell=True)

        # 2b. Add reads that fall in any of the intervals we want to include.
        if force_include_reads_in_intervals:
            with open(intermediate_sam, 'a') as output_fh:
                get_reads_in_interval_list(orig_bam_filename,
                        force_include_reads_in_intervals, output_fh)

        # 2c. For each of the reads that we've included, grab their
        # corresponding pairs.
        add_paired_mates(intermediate_sam, orig_bam_filename,
                paired_intermediate_sam)

        # 3. Sort by name so that paired reads are next to each other
        cmd = (
                '(grep ^"@" {sam_file}; grep -v ^"@" {sam_file} | '
                'sort -k1,1 -k2,2n) > {sorted_sam_file}'
        ).format(
                sam_file=paired_intermediate_sam,
                sorted_sam_file=sorted_intermediate_sam)
        subprocess.call(cmd, shell=True)

        # 4. Filter out duplicate reads. Requires sort in step 3.
        cmd = 'uniq {sorted_sam_file} > {deduped_sam_file}'.format(
                sorted_sam_file=sorted_intermediate_sam,
                deduped_sam_file=deduped_sorted_intermediate_sam)
        subprocess.call(cmd, shell=True)

        # 5. Convert to bam.
        cmd = '{samtools} view -Sb {sam_file} > {final_bam_file}'.format(
                samtools=settings.SAMTOOLS_BINARY,
                sam_file=deduped_sorted_intermediate_sam,
                final_bam_file=de_novo_bam_filelocation)
        subprocess.call(cmd, shell=True)

        # 6. Delete intermediate files.
        for f in intermediate_files:
            os.remove(f)
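
# Steps 3-5 of the strategy above (name-sort, uniq-based dedup, SAM -> BAM) can
# be exercised on their own. A hedged, standalone sketch that mirrors, rather
# than replaces, derivation_fn(); it reuses the module's existing os,
# subprocess, and settings imports, and the file paths are illustrative.
def _name_sort_dedup_to_bam(sam_path, bam_path):
    """Keep the header, name-sort the alignment lines, drop exact duplicate
    lines, then convert the result to BAM.
    """
    sorted_sam = os.path.splitext(sam_path)[0] + '.sorted.sam'
    deduped_sam = os.path.splitext(sorted_sam)[0] + '.deduped.sam'

    # Header lines first, then alignment lines sorted by read name and flag.
    subprocess.check_call(
            '(grep ^"@" {sam}; grep -v ^"@" {sam} | sort -k1,1 -k2,2n)'
            ' > {out}'.format(sam=sam_path, out=sorted_sam),
            shell=True)

    # Drop adjacent duplicate lines; relies on the sort above.
    subprocess.check_call(
            'uniq {0} > {1}'.format(sorted_sam, deduped_sam), shell=True)

    # Convert the deduplicated SAM to BAM.
    subprocess.check_call(
            '{samtools} view -Sb {sam} > {bam}'.format(
                    samtools=settings.SAMTOOLS_BINARY,
                    sam=deduped_sam, bam=bam_path),
            shell=True)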
Example #33
def align_with_bwa_mem(alignment_group, sample_alignment):
    """
    REPLACES OLD BWA PIPELINE USING ALN AND SAMPE/SAMSE
    Aligns a sample to a reference genome using the bwa tool.

    Args:
        alignment_group: AlignmentGroup that this alignment is part of.
        sample_alignment: ExperimentSampleToAlignment. The respective dataset
            is assumed to have been created as well.
    """

    # Start by getting fresh objects from the database.
    sample_alignment = ExperimentSampleToAlignment.objects.get(
        id=sample_alignment.id)
    experiment_sample = sample_alignment.experiment_sample
    alignment_group = AlignmentGroup.objects.get(id=alignment_group.id)

    # Grab the reference genome fasta for the alignment.
    ref_genome_fasta = get_dataset_with_type(
        alignment_group.reference_genome,
        Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

    # Get the BWA Dataset and set it to computing.
    bwa_dataset = sample_alignment.dataset_set.get(type=Dataset.TYPE.BWA_ALIGN)
    bwa_dataset.status = Dataset.STATUS.COMPUTING
    bwa_dataset.save(update_fields=['status'])

    # Create a file that we'll write stderr to.
    error_path = os.path.join(sample_alignment.get_model_data_dir(),
                              'bwa_align.error')
    error_output = open(error_path, 'w')

    # The alignment group is now officially ALIGNING.
    if alignment_group.status != AlignmentGroup.STATUS.ALIGNING:
        alignment_group.status = AlignmentGroup.STATUS.ALIGNING
        alignment_group.start_time = datetime.now()
        alignment_group.end_time = None
        alignment_group.save(
            update_fields=['status', 'start_time', 'end_time'])

    error_output.write(
        "==START OF ALIGNMENT PIPELINE FOR %s, (%s) ==\n" %
        (sample_alignment.experiment_sample.label, sample_alignment.uid))

    # We wrap the alignment logic in a try-except so that if an error occurs,
    # we record it and update the status of the Dataset to FAILED if anything
    # should fail.
    try:
        # Build index if the index doesn't exist.
        # NOTE: When aligning multiple samples to the same reference genome
        # concurrently, the build index method should be called once to completion
        # before starting the concurrent alignment jobs.
        ensure_bwa_index(ref_genome_fasta)

        # Grab the fastq sources and determine whether the reads are paired-end.
        # First, grab fastq1, which must exist
        fq1_queryset = experiment_sample.dataset_set.filter(
            type=Dataset.TYPE.FASTQ1)
        assert fq1_queryset, "Must have at least one .fastq file"
        fq1_dataset = fq1_queryset[0]
        input_reads_1_fq = fq1_dataset.wrap_if_compressed()
        input_reads_1_fq_path = fq1_dataset.get_absolute_location()

        # Second, check if fastq2 exists and set is_paired_end
        fq2_queryset = experiment_sample.dataset_set.filter(
            type=Dataset.TYPE.FASTQ2)
        if fq2_queryset:
            is_paired_end = True
            fq2_dataset = fq2_queryset[0]
            input_reads_2_fq = fq2_dataset.wrap_if_compressed()
            input_reads_2_fq_path = fq2_dataset.get_absolute_location()
        else:
            is_paired_end = False

        # 1. Build the bwa mem alignment command.
        read_fq_1_path, read_fq_1_fn = os.path.split(input_reads_1_fq_path)

        align_input_args = ' '.join([
            '%s/bwa/bwa' % settings.TOOLS_DIR,
            'mem',
            '-t',
            '1',  # threads
            '-R',
            '"' + read_group_string(experiment_sample) + '"',
            # '-a' keeps secondary alignments (useful for finding and marking
            # paralogous regions). NOTE: the de novo assembly code may need
            # updating to handle secondary alignments.
            '-a',
            ref_genome_fasta,
            input_reads_1_fq,
        ])

        if is_paired_end:
            read_fq_2_path, read_fq_2_fn = os.path.split(input_reads_2_fq_path)
            align_input_args += ' ' + input_reads_2_fq

        # Pipe the output straight into samtools so that the intermediate SAM
        # is never written to disk; the result is a BAM file.
        align_input_args += ' | ' + settings.SAMTOOLS_BINARY + ' view -bS -'

        ### 2. Run the alignment and write BAM output.
        output_bam = os.path.join(sample_alignment.get_model_data_dir(),
                                  'bwa_align.bam')

        error_output.write(align_input_args)

        # Flush the output here so it gets written before the alignments.
        error_output.flush()

        with open(output_bam, 'w') as fh:
            subprocess.check_call(align_input_args,
                                  stdout=fh,
                                  stderr=error_output,
                                  shell=True,
                                  executable=settings.BASH_PATH)

        # If the reads are not paired-end, set the processing mask to skip
        # computing insert metrics, since the lumpy script only works on
        # paired-end reads.
        opt_processing_mask = {}
        if not is_paired_end:
            opt_processing_mask['compute_insert_metrics'] = False

        # Do several layers of processing on top of the initial alignment.
        result_bam_file = process_sam_bam_file(
            sample_alignment,
            alignment_group.reference_genome,
            output_bam,
            error_output,
            opt_processing_mask=opt_processing_mask)

        # Add the resulting file to the dataset.
        bwa_dataset.filesystem_location = clean_filesystem_location(
            result_bam_file)
        bwa_dataset.save()

        # Isolate split and discordant reads for SV calling.
        get_discordant_read_pairs(sample_alignment)
        get_split_reads(sample_alignment)

        # Add track to JBrowse.
        add_bam_file_track(alignment_group.reference_genome, sample_alignment,
                           Dataset.TYPE.BWA_ALIGN)

        bwa_dataset.status = Dataset.STATUS.READY
        bwa_dataset.save()

        delete_redundant_files(sample_alignment.get_model_data_dir())

    except Exception:
        import traceback
        error_output.write(traceback.format_exc())
        bwa_dataset.status = Dataset.STATUS.FAILED
        bwa_dataset.save()
        return
    finally:
        print(error_path)
        error_output.write('==END OF ALIGNMENT PIPELINE==\n')
        error_output.close()

        # Add the error Dataset to the object.
        error_dataset = Dataset.objects.create(
            label=Dataset.TYPE.BWA_ALIGN_ERROR,
            type=Dataset.TYPE.BWA_ALIGN_ERROR,
            filesystem_location=clean_filesystem_location(error_path))
        sample_alignment.dataset_set.add(error_dataset)
        sample_alignment.save()

        return sample_alignment
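
# The core shell pattern in align_with_bwa_mem() is: build the bwa index once,
# then pipe `bwa mem` straight into `samtools view -bS -` so that no
# intermediate SAM hits disk. A hedged, standalone sketch of that pattern; the
# paths and the read-group string are illustrative placeholders, the flags are
# standard bwa/samtools options, and the module's existing os, subprocess, and
# settings imports are reused.
def _bwa_mem_to_bam_sketch(ref_fasta, fastq1, out_bam, fastq2=None):
    """Standalone sketch of the bwa-mem-to-BAM piping used above."""
    bwa = '%s/bwa/bwa' % settings.TOOLS_DIR

    # `bwa index` writes ref_fasta.bwt (among other files); build it if missing.
    if not os.path.exists(ref_fasta + '.bwt'):
        subprocess.check_call([bwa, 'index', ref_fasta])

    read_group = '@RG\\tID:sample\\tSM:sample'  # placeholder read group
    cmd = ('{bwa} mem -t 1 -R "{rg}" {ref} {fq1} {fq2} | '
           '{samtools} view -bS -').format(
                   bwa=bwa, rg=read_group, ref=ref_fasta, fq1=fastq1,
                   fq2=fastq2 or '', samtools=settings.SAMTOOLS_BINARY)

    # Write the piped BAM output directly to out_bam.
    with open(out_bam, 'w') as fh:
        subprocess.check_call(cmd, stdout=fh, shell=True,
                executable=settings.BASH_PATH)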