def _create_alignment(self, haploid=False):
    """Set up fake alignment fixtures for tests.

    Creates self.alignment_group and self.sample_1, attaches the fake
    paired fastq reads, links the sample to the alignment group, and
    attaches the pre-computed fake BAM (plus index) as the BWA_ALIGN
    dataset.

    Args:
        haploid: If True, configure the alignment group to call
            variants as haploid.
    """
    # Create a new alignment group.
    self.alignment_group = AlignmentGroup.objects.create(
            label='test alignment',
            reference_genome=self.REFERENCE_GENOME)
    if haploid:
        self.alignment_group.alignment_options['call_as_haploid'] = True
        # BUGFIX: persist the option; without save() the mutation to
        # alignment_options is lost when the model is re-fetched.
        self.alignment_group.save()

    # Create a sample.
    self.sample_1 = ExperimentSample.objects.create(
            uid=self.FAKE_READS_SAMPLE_UID,
            project=self.project,
            label='sample1')

    # Add the raw reads.
    copy_and_add_dataset_source(self.sample_1, Dataset.TYPE.FASTQ1,
            Dataset.TYPE.FASTQ1, self.FAKE_READS_FASTQ1)
    copy_and_add_dataset_source(self.sample_1, Dataset.TYPE.FASTQ2,
            Dataset.TYPE.FASTQ2, self.FAKE_READS_FASTQ2)

    # Create alignment to the sample.
    sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=self.alignment_group,
            experiment_sample=self.sample_1)

    # Add alignment data. NOTE: Stored in sample model dir.
    copy_dest = copy_dataset_to_entity_data_dir(
            self.sample_1, self.FAKE_READS_BAM)
    copy_dataset_to_entity_data_dir(
            self.sample_1, self.FAKE_READS_BAM_INDEX)
    add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
            Dataset.TYPE.BWA_ALIGN, copy_dest)
def _create_alignment(self, haploid=False):
    """Create test fixtures: an AlignmentGroup, a backing sample with
    fake paired reads, and a sample alignment carrying the fake BAM.

    Args:
        haploid: If True, set the 'call_as_haploid' alignment option.
    """
    self.alignment_group = AlignmentGroup.objects.create(
            label='test alignment',
            reference_genome=self.REFERENCE_GENOME)
    if haploid:
        self.alignment_group.alignment_options['call_as_haploid'] = True
        # BUGFIX: save the model so the mutated alignment_options dict
        # is actually persisted to the database.
        self.alignment_group.save()

    # Sample with raw paired-end reads attached.
    self.sample_1 = ExperimentSample.objects.create(
            uid=self.FAKE_READS_SAMPLE_UID,
            project=self.project,
            label='sample1')
    copy_and_add_dataset_source(self.sample_1, Dataset.TYPE.FASTQ1,
            Dataset.TYPE.FASTQ1, self.FAKE_READS_FASTQ1)
    copy_and_add_dataset_source(self.sample_1, Dataset.TYPE.FASTQ2,
            Dataset.TYPE.FASTQ2, self.FAKE_READS_FASTQ2)

    # Link the sample to the alignment group.
    sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=self.alignment_group,
            experiment_sample=self.sample_1)

    # Attach pre-computed alignment data (stored in the sample model
    # dir): BAM index is copied only, BAM becomes the BWA_ALIGN dataset.
    copy_dest = copy_dataset_to_entity_data_dir(
            self.sample_1, self.FAKE_READS_BAM)
    copy_dataset_to_entity_data_dir(
            self.sample_1, self.FAKE_READS_BAM_INDEX)
    add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
            Dataset.TYPE.BWA_ALIGN, copy_dest)
def compute_insert_metrics(bam_file, sample_alignment, stderr=None):
    """Computes read fragment insert size distribution.

    Pipes `samtools view` output through lumpy's pairend_distro script,
    then creates a Dataset for each of:
        * histogram file
        * file with mean and stdev comma-separated
    """
    bam_prefix = os.path.splitext(bam_file)[0]
    histo_file = bam_prefix + '.insert_size_histogram.txt'
    mean_stdev_file = bam_prefix + '.insert_size_mean_stdev.txt'

    # First, we analyze the bam distribution: stream alignments out of
    # the bam with samtools.
    p_view = Popen([settings.SAMTOOLS_BINARY, 'view', bam_file],
            stdout=PIPE, stderr=stderr)

    read_length = get_read_length(bam_file)
    p_distro = Popen([
            settings.LUMPY_PAIREND_DISTRO_BIN,
            '-r', str(read_length),
            '-X', '4',  # num stdevs from end to extend
            '-N', '10000',  # number to sample
            '-o', histo_file],
            stdin=p_view.stdout, stdout=PIPE, stderr=stderr)

    # Allow the samtools process to receive a SIGPIPE if the consumer exits.
    p_view.stdout.close()

    # Run the pipeline; output looks like "mean:<float>\tstdev:<float>".
    raw_mean, raw_stdev = p_distro.communicate()[0].split('\t')
    mean = int(float(raw_mean.split(':')[1].strip()))
    stdev = int(float(raw_stdev.split(':')[1].strip()))

    # Lumpy doesn't like stdev of 0.
    stdev = max(stdev, 1)

    # Save the histogram file as a Dataset.
    add_dataset_to_entity(
            sample_alignment,
            Dataset.TYPE.LUMPY_INSERT_METRICS_HISTOGRAM,
            Dataset.TYPE.LUMPY_INSERT_METRICS_HISTOGRAM,
            filesystem_location=histo_file)

    # Write mean, stdev to another file and create another Dataset.
    with open(mean_stdev_file, 'w') as fh:
        fh.write("%d,%d" % (mean, stdev))
    add_dataset_to_entity(
            sample_alignment,
            Dataset.TYPE.LUMPY_INSERT_METRICS_MEAN_STDEV,
            Dataset.TYPE.LUMPY_INSERT_METRICS_MEAN_STDEV,
            filesystem_location=mean_stdev_file)
def test_run_alignment_with_spaces_in_genbank_filename(self):
    """A genbank upload whose filename contains spaces still aligns."""
    project = self.common_entities['project']
    ref_genome_label = 'dirty_upload'

    # Build a fake browser-upload request carrying the genbank file.
    request = HttpRequest()
    request.POST = {
        'projectUid': project.uid,
        'refGenomeLabel': ref_genome_label,
        'importFileFormat': 'genbank'
    }
    request.method = 'POST'
    request.user = self.common_entities['user']
    authenticate(username=TEST_USERNAME, password=TEST_PASSWORD)
    self.assertTrue(request.user.is_authenticated())
    request.FILES['refGenomeFile'] = UploadedFile(
            file=open(TEST_GENBANK),
            name='dirty_genbank (spaces).gb')

    response = create_ref_genome_from_browser_upload(request)
    self.assertEqual(STATUS_CODE__SUCCESS, response.status_code)
    self.assertFalse(json.loads(response.content).get('error', False))

    # Get reference genome created by the upload.
    ref_genome = ReferenceGenome.objects.get(
            project=project, label=ref_genome_label)

    # Create a sample with paired fastq datasets.
    sample = ExperimentSample.objects.create(
            project=project, label='test_sample')
    add_dataset_to_entity(sample, Dataset.TYPE.FASTQ1,
            Dataset.TYPE.FASTQ1, filesystem_location=TEST_DIRTY_FQ_1)
    add_dataset_to_entity(sample, Dataset.TYPE.FASTQ2,
            Dataset.TYPE.FASTQ2, filesystem_location=TEST_DIRTY_FQ_2)

    # Run alignment of sample to reference.
    alignment_group_label = 'test_alignment'
    sample_list = [sample]
    result = run_pipeline(alignment_group_label, ref_genome, sample_list)
    alignment_group = result[0]
    alignment_async_result = result[1]
    variant_calling_async_result = result[2]

    # Block until both async stages finish.
    alignment_async_result.get()
    variant_calling_async_result.get()

    alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
    self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
            alignment_group.status)
def _perform_assembly(self, data_dict):
    """Align reads to a reference and run the de novo assembly pipeline.

    Args:
        data_dict: dict with keys 'ref_fasta', 'fq_1', 'fq_2'.

    Returns:
        Queryset of Contig objects produced by the assembly.
    """
    ref_fasta = data_dict['ref_fasta']
    fq_1 = data_dict['fq_1']
    fq_2 = data_dict['fq_2']

    # Import reference genome.
    ref_genome = import_reference_genome_from_local_file(
            self.project, 'test_ref', ref_fasta, 'fasta', move=False)

    # Create sample model and attach the paired fastq datasets.
    sample = ExperimentSample.objects.create(
            project=self.project, label='test_sample')
    add_dataset_to_entity(
            sample, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
            filesystem_location=fq_1)
    add_dataset_to_entity(
            sample, Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2,
            filesystem_location=fq_2)

    # Run alignment of sample to reference, skipping variant calling.
    alignment_group, _, _ = run_pipeline(
            'test_alignment', ref_genome, [sample],
            perform_variant_calling=False, alignment_options={})

    # Get resulting ExperimentSampleToAlignment.
    sample_align = ExperimentSampleToAlignment.objects.get(
            alignment_group=alignment_group, experiment_sample=sample)

    # Run assembly pipeline and wait on result.
    async_result = run_de_novo_assembly_pipeline([sample_align])
    async_result.get()

    # Retrieve contigs.
    return Contig.objects.filter(
            parent_reference_genome=ref_genome,
            experiment_sample_to_alignment=sample_align)
def _get_or_create_sv_dataset(key):
    """Return the sv-indicant Dataset for `key`, generating it if needed.

    An existing dataset is reused unless `overwrite` is set AND we know
    how to regenerate this dataset type; in that case the stale dataset
    is deleted and rebuilt from the alignment bam.
    """
    dataset_query = sample_alignment.dataset_set.filter(type=key)
    exists = dataset_query.exists()
    can_generate = key in sv_indicant_class_to_generator

    # Reuse the existing dataset when not overwriting, or when we have
    # no generator for this type anyway.
    if exists and (not overwrite or not can_generate):
        assert len(dataset_query) == 1
        return dataset_query[0]

    if exists:
        # overwrite requested and regeneration possible: drop the old one.
        assert len(dataset_query) == 1
        dataset_query[0].delete()

    # Generate the bam and register it as a new dataset.
    dataset_path = '.'.join([
            alignment_file_prefix,
            sv_indicant_class_to_filename_suffix[key],
            'bam'])
    generator = sv_indicant_class_to_generator[key]
    generator(alignment_bam, dataset_path)
    return add_dataset_to_entity(
            sample_alignment, key, key,
            filesystem_location=dataset_path)
def _get_or_create_sv_dataset(key):
    """Fetch the Dataset of the given type for the sample alignment,
    regenerating it when `overwrite` is set and a generator exists.
    """
    dataset_query = sample_alignment.dataset_set.filter(type=key)

    if dataset_query.exists() and not overwrite or (
            dataset_query.exists() and
            key not in sv_indicant_class_to_generator):
        # Reuse the existing dataset as-is.
        assert len(dataset_query) == 1
        return dataset_query[0]
    elif dataset_query.exists() and overwrite and (
            key in sv_indicant_class_to_generator):
        # Overwriting: delete the stale dataset before regenerating.
        assert len(dataset_query) == 1
        dataset_query[0].delete()

    if (overwrite and key in sv_indicant_class_to_generator) or (
            not dataset_query.exists()):
        # Build the bam via the type-specific generator and register it.
        dataset_path = '.'.join([
                alignment_file_prefix,
                sv_indicant_class_to_filename_suffix[key],
                'bam'])
        generator = sv_indicant_class_to_generator[key]
        generator(alignment_bam, dataset_path)
        return add_dataset_to_entity(
                sample_alignment, key, key,
                filesystem_location=dataset_path)
def cov_detect_deletion_make_vcf(sample_alignment): """Uses coverage data to call large deletions and creates a VCF_COV_DETECT_DELETIONS dataset for the sample alignment Args: sample_alignment: ExperimentSampleToAlignment instance """ # Don't proceed if processing this sample alignment previously failed or # in another async process. sample_alignment = ExperimentSampleToAlignment.objects.get( uid=sample_alignment.uid) if (sample_alignment.data.get('assembly_status') == ExperimentSampleToAlignment.ASSEMBLY_STATUS.FAILED): return # Set assembly status for UI # NOTE: Setting this status is playing whack-a-mole against other async sv # detection functions, e.g. assembly.generate_contigs(). set_assembly_status( sample_alignment, ExperimentSampleToAlignment.ASSEMBLY_STATUS.ANALYZING_COVERAGE) print "Generating coverage data\n" chrom_regions = get_deleted_regions(sample_alignment) var_dict_list = make_var_dict_list( chrom_regions, get_fasta(sample_alignment.alignment_group.reference_genome)) if var_dict_list: vcf_path = os.path.join( sample_alignment.get_model_data_dir(), 'cov_detect_deletion.vcf') # Write variant dicts to vcf export_var_dict_list_as_vcf(var_dict_list, vcf_path, sample_alignment, CUSTOM_SV_METHOD__COVERAGE) # Make dataset for contigs vcf new_dataset = add_dataset_to_entity( sample_alignment, Dataset.TYPE.VCF_COV_DETECT_DELETIONS, Dataset.TYPE.VCF_COV_DETECT_DELETIONS, vcf_path) new_dataset.save() # Update status again if not FAILED. sample_alignment = ExperimentSampleToAlignment.objects.get( uid=sample_alignment.uid) if (sample_alignment.data.get('assembly_status') != ExperimentSampleToAlignment.ASSEMBLY_STATUS.FAILED): set_assembly_status( sample_alignment, ExperimentSampleToAlignment.ASSEMBLY_STATUS.WAITING_TO_PARSE)
def make_contig_reads_dataset(contig, sv_indicant_reads_in_contig):
    '''
    Write the contig reads generated by extract_contig_reads() to a bam
    file, coordinate sort and index it, and attach it to the contig as
    its BWA_SV_INDICANTS dataset (replacing any existing one).
    '''
    # Destination for the raw (unsorted) extracted reads.
    raw_bam = os.path.join(contig.get_model_data_dir(), 'sv_indicants.bam')

    # Use the sample alignment's bam as a header template.
    template_bam = contig.experiment_sample_to_alignment.dataset_set.get(
            type=Dataset.TYPE.BWA_ALIGN).get_absolute_location()
    template_af = pysam.AlignmentFile(template_bam)
    out_af = pysam.AlignmentFile(raw_bam, "wb", template=template_af)
    template_af.close()

    for read in sv_indicant_reads_in_contig:
        out_af.write(read)
    out_af.close()

    # Coordinate sort and index for downstream consumers.
    sorted_bam = (os.path.splitext(raw_bam)[0] + '.coordinate_sorted.bam')
    sort_bam_by_coordinate(raw_bam, sorted_bam)
    index_bam(sorted_bam)

    # Replace any existing BWA_SV_INDICANTS dataset on the contig.
    existing = contig.dataset_set.filter(type=Dataset.TYPE.BWA_SV_INDICANTS)
    if existing.count():
        existing[0].delete()
    add_dataset_to_entity(
            contig,
            Dataset.TYPE.BWA_SV_INDICANTS,
            Dataset.TYPE.BWA_SV_INDICANTS,
            filesystem_location=sorted_bam)
def cov_detect_deletion_make_vcf(sample_alignment): """Uses coverage data to call large deletions and creates a VCF_COV_DETECT_DELETIONS dataset for the sample alignment Args: sample_alignment: ExperimentSampleToAlignment instance """ # Don't proceed if processing this sample alignment previously failed or # in another async process. sample_alignment = ExperimentSampleToAlignment.objects.get( uid=sample_alignment.uid) if (sample_alignment.data.get('assembly_status') == ExperimentSampleToAlignment.ASSEMBLY_STATUS.FAILED): return # Set assembly status for UI # NOTE: Setting this status is playing whack-a-mole against other async sv # detection functions, e.g. assembly.generate_contigs(). set_assembly_status( sample_alignment, ExperimentSampleToAlignment.ASSEMBLY_STATUS.ANALYZING_COVERAGE) print "Generating coverage data\n" chrom_regions = get_deleted_regions(sample_alignment) var_dict_list = make_var_dict_list( chrom_regions, get_fasta(sample_alignment.alignment_group.reference_genome)) if var_dict_list: vcf_path = os.path.join(sample_alignment.get_model_data_dir(), 'cov_detect_deletion.vcf') # Write variant dicts to vcf export_var_dict_list_as_vcf(var_dict_list, vcf_path, sample_alignment, CUSTOM_SV_METHOD__COVERAGE) # Make dataset for contigs vcf new_dataset = add_dataset_to_entity( sample_alignment, Dataset.TYPE.VCF_COV_DETECT_DELETIONS, Dataset.TYPE.VCF_COV_DETECT_DELETIONS, vcf_path) new_dataset.save() # Update status again if not FAILED. sample_alignment = ExperimentSampleToAlignment.objects.get( uid=sample_alignment.uid) if (sample_alignment.data.get('assembly_status') != ExperimentSampleToAlignment.ASSEMBLY_STATUS.FAILED): set_assembly_status( sample_alignment, ExperimentSampleToAlignment.ASSEMBLY_STATUS.WAITING_TO_PARSE)
def _make_fake_contig(label, esta):
    """Create a Contig with random coverage metadata and a random fasta
    dataset attached.
    """
    contig = Contig.objects.create(
            label=label,
            parent_reference_genome=ag.reference_genome,
            experiment_sample_to_alignment=esta)
    contig.metadata['coverage'] = random.random() * 100

    # Add fasta.
    contig.ensure_model_data_dir_exists()

    # Random sequence of 0-100 bases.
    num_bases = random.randint(0, 100)
    seq = Seq(''.join([random.choice('ATCG') for i in range(num_bases)]))
    seq_record = SeqRecord(seq, id=contig.uid)
    dataset_path = os.path.join(contig.get_model_data_dir(), 'fasta.fa')
    with open(dataset_path, 'w') as fh:
        SeqIO.write(seq_record, fh, 'fasta')
    add_dataset_to_entity(
            contig, 'contig_fasta', Dataset.TYPE.REFERENCE_GENOME_FASTA,
            filesystem_location=dataset_path)
    contig.save()
def make_altalign_dataset(sample_alignment):
    """Extract alternate-alignment reads from the sample's BWA bam and
    attach them to the sample alignment as a BWA_ALTALIGN dataset.

    Returns:
        The newly created Dataset.
    """
    source_bam = sample_alignment.dataset_set.get(
            type=Dataset.TYPE.BWA_ALIGN).get_absolute_location()
    prefix = os.path.join(
            sample_alignment.get_model_data_dir(), 'bwa_align')
    altalign_bam = '.'.join([prefix, 'altalign', 'bam'])
    get_altalign_reads(source_bam, altalign_bam)
    return add_dataset_to_entity(
            sample_alignment,
            Dataset.TYPE.BWA_ALTALIGN,
            Dataset.TYPE.BWA_ALTALIGN,
            filesystem_location=altalign_bam)
def make_contig_reads_dataset(contig, sv_indicant_reads_in_contig):
    '''
    Using the contig reads generated by extract_contig_reads(), generate
    a bam file, sort and index it, then attach it to the contig as its
    BWA_SV_INDICANTS dataset (overwriting any existing one).
    '''
    # Get bam filename.
    extracted_reads_bam_file = os.path.join(
            contig.get_model_data_dir(), 'sv_indicants.bam')

    bwa_align_bam = contig.experiment_sample_to_alignment.dataset_set.get(
            type=Dataset.TYPE.BWA_ALIGN).get_absolute_location()
    sam_file = pysam.AlignmentFile(bwa_align_bam)

    # Write extracted reads into bam file, reusing the source header.
    extracted_reads_alignment_file = pysam.AlignmentFile(
            extracted_reads_bam_file, "wb", template=sam_file)
    sam_file.close()
    for read in sv_indicant_reads_in_contig:
        extracted_reads_alignment_file.write(read)
    extracted_reads_alignment_file.close()

    coordinate_sorted_bam = (
            os.path.splitext(extracted_reads_bam_file)[0] +
            '.coordinate_sorted.bam')
    sort_bam_by_coordinate(extracted_reads_bam_file, coordinate_sorted_bam)
    index_bam(coordinate_sorted_bam)

    # Add the bam file to contig as BWA_SV_INDICANTS dataset, overwriting
    # it if it already exists.
    dataset_query = contig.dataset_set.filter(
            type=Dataset.TYPE.BWA_SV_INDICANTS)
    if dataset_query.count():
        dataset_query[0].delete()
    add_dataset_to_entity(contig,
            Dataset.TYPE.BWA_SV_INDICANTS,
            Dataset.TYPE.BWA_SV_INDICANTS,
            filesystem_location=coordinate_sorted_bam)
def _create_samples(self, fq_1, fq_2, num=1):
    """Create `num` ExperimentSamples, each backed by the given fastq pair.

    Returns:
        List of the created ExperimentSample objects.
    """
    sample_list = []
    for sample_num in range(num):
        sample = ExperimentSample.objects.create(
                project=self.project,
                label='test_sample_' + str(sample_num))

        # Attach the paired fastq datasets to the sample.
        add_dataset_to_entity(
                sample, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
                filesystem_location=fq_1)
        add_dataset_to_entity(
                sample, Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2,
                filesystem_location=fq_2)
        sample_list.append(sample)
    return sample_list
def _make_fake_contig(label, esta):
    """Fabricate a Contig for testing: random coverage, random ATCG
    fasta written to its model data dir and attached as a dataset.
    """
    c = Contig.objects.create(
            label=label,
            parent_reference_genome=ag.reference_genome,
            experiment_sample_to_alignment=esta)
    c.metadata['coverage'] = random.random() * 100

    # Add fasta.
    c.ensure_model_data_dir_exists()

    # Random sequence.
    num_bases = random.randint(0, 100)
    seq = Seq(''.join([random.choice('ATCG') for i in range(num_bases)]))
    seq_record = SeqRecord(seq, id=c.uid)

    dataset_path = os.path.join(c.get_model_data_dir(), 'fasta.fa')
    with open(dataset_path, 'w') as fh:
        SeqIO.write(seq_record, fh, 'fasta')

    add_dataset_to_entity(
            c,
            'contig_fasta',
            Dataset.TYPE.REFERENCE_GENOME_FASTA,
            filesystem_location=dataset_path)
    c.save()
def make_altalign_dataset(sample_alignment):
    """Build the altalign bam from the sample alignment's BWA bam and
    register it as the BWA_ALTALIGN dataset.
    """
    sample_alignment_bam = sample_alignment.dataset_set.get(
            type=Dataset.TYPE.BWA_ALIGN).get_absolute_location()

    alignment_file_prefix = os.path.join(
            sample_alignment.get_model_data_dir(),
            'bwa_align')
    altalign_bam = '.'.join([
            alignment_file_prefix,
            'altalign',
            'bam'])

    get_altalign_reads(sample_alignment_bam, altalign_bam)

    return add_dataset_to_entity(
            sample_alignment,
            Dataset.TYPE.BWA_ALTALIGN,
            Dataset.TYPE.BWA_ALTALIGN,
            filesystem_location=altalign_bam)
def test_end_to_end(self):
    """Test running full pipline on small-ish data.

    The data file consists of 20,000 bases. At 5,000 bases there is
    a 400 base deletion. At 10,000 bases there is a 400 base inversion.
    At 15,000 bases there is a 400 base tandem duplication.

    It seems that Pindel cannot find the inversion. Fortunately, delly
    can usually find inversions. Unfortunately, delly only works well
    on large data, so we will not test it here.
    """
    # Create a new alignment group.
    alignment_group = AlignmentGroup.objects.create(
            label='test alignment',
            reference_genome=self.reference_genome)

    # Create a sample.
    sample_1 = ExperimentSample.objects.create(
            uid=TEST_SAMPLE_UID,
            project=self.project,
            label='sample1')

    # Add the raw reads.
    copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ1,
            Dataset.TYPE.FASTQ1, TEST_FASTQ1)
    copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ2,
            Dataset.TYPE.FASTQ2, TEST_FASTQ2)

    # Create relationship between alignment and sample.
    sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=alignment_group,
            experiment_sample=sample_1)

    # Add alignment data. NOTE: Stored in sample model dir.
    # index (no dataset)
    copy_dataset_to_entity_data_dir(sample_1, TEST_BAM_INDEX)
    # bam file (with dataset)
    copy_dest = copy_dataset_to_entity_data_dir(sample_1, TEST_BAM)
    add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
            Dataset.TYPE.BWA_ALIGN, copy_dest)

    # Make sure there are no variants before.
    self.assertEqual(0, len(Variant.objects.filter(
            reference_genome=self.reference_genome)))

    # Test with Pindel only for now.
    for tool in ['pindel']:
        find_variants_with_tool(
                alignment_group, VARIANT_TOOL_PARAMS_MAP[tool],
                project=self.project)

    # Check that the alignment group has a pindel vcf dataset
    # associated with it.
    vcf_dataset = get_dataset_with_type(
            alignment_group, Dataset.TYPE.VCF_PINDEL)
    self.assertIsNotNone(vcf_dataset)

    # Make sure the .vcf file actually exists.
    self.assertTrue(os.path.exists(vcf_dataset.get_absolute_location()))

    # Make sure the vcf is valid by reading it using pyvcf.
    with open(vcf_dataset.get_absolute_location()) as vcf_fh:
        try:
            reader = vcf.Reader(vcf_fh)
            reader.next()
        except:
            self.fail("Not valid vcf")

    # Grab the resulting variants.
    variants = Variant.objects.filter(
            reference_genome=self.reference_genome)

    # Confirm that 2 variants found.
    self.assertEqual(2, len(variants))

    # Collect (position, svlen) keyed by svtype.
    variant_map = {}
    for variant in variants:
        variant_alternates = VariantAlternate.objects.filter(
                variant=variant)
        # There should be only one variant alternate per SV.
        self.assertEqual(len(variant_alternates), 1)
        pos = variant.position
        svtype = variant_alternates[0].data['INFO_SVTYPE']
        svlen = variant_alternates[0].data['INFO_SVLEN']
        variant_map[svtype] = (pos, svlen)

    # Check that there is a deletion around base 5000.
    self.assertTrue('DEL' in variant_map)
    self.assertTrue(abs(variant_map['DEL'][0] - 5000) <= 3)
    self.assertTrue(abs(variant_map['DEL'][1] - 400) <= 3)

    # Check that there is a tandem duplication around base 15000.
    self.assertTrue('DUP:TANDEM' in variant_map)
    self.assertTrue(abs(variant_map['DUP:TANDEM'][0] - 15000) <= 3)
    self.assertTrue(abs(variant_map['DUP:TANDEM'][1] - 400) <= 3)
def align_contig_reads_to_contig(contig):
    """Align the reads that assembled a contig back onto the contig.

    Extracts the contig's constituent reads from the sample's source
    fastqs, aligns them with bwa mem to the contig fasta, then attaches
    the coordinate-sorted bam as the contig's BWA_ALIGN dataset
    (replacing any existing one).
    """
    # Fasta of reads used to make the contig.
    contig_reads_fasta = os.path.join(
            contig.get_model_data_dir(),
            'extracted_reads.fa')

    # Map read qname -> list of read numbers (1/2) seen in the fasta.
    fasta_header_re = re.compile('>(\S+)/(\d)')
    contig_reads = defaultdict(list)
    with open(contig_reads_fasta) as fh:
        for line in fh:
            header_match = fasta_header_re.match(line)
            if header_match:
                read_id = header_match.group(1)
                read_number = int(header_match.group(2))
                contig_reads[read_id].append(read_number)

    # Get source reads fastqs.
    sample = contig.experiment_sample_to_alignment.experiment_sample
    source_fq1 = sample.dataset_set.get(
            type=Dataset.TYPE.FASTQ1).get_absolute_location()
    source_fq2_query = sample.dataset_set.filter(type=Dataset.TYPE.FASTQ2)
    is_paired_end = source_fq2_query.exists()

    # Pair up source fastqs with output filenames for the contig reads.
    source_fq_list = [source_fq1]
    output_fq_list = [
            os.path.join(contig.get_model_data_dir(), 'reads.1.fq')]
    if is_paired_end:
        source_fq_list.append(source_fq2_query[0].get_absolute_location())
        output_fq_list.append(
                os.path.join(contig.get_model_data_dir(), 'reads.2.fq'))

    # Go through source fastqs and copy every 4-line record whose qname
    # appears in contig_reads.
    fastq_header_re = re.compile('@(\S+)')
    for input_fq_path, output_fq_path in zip(source_fq_list, output_fq_list):
        if input_fq_path.endswith('.fq'):
            file_like = open(input_fq_path)
        elif input_fq_path.endswith('.gz'):
            file_like = gzip.open(input_fq_path)
        else:
            raise Exception('Compression type not supported')

        with file_like as in_fh, \
                open(output_fq_path, 'w') as out_fh:
            for line in in_fh:
                header_match = fastq_header_re.match(line)
                if header_match:
                    qname = header_match.group(1)
                    if qname in contig_reads:
                        out_fh.write(line)
                        out_fh.write(in_fh.next())
                        out_fh.write(in_fh.next())
                        out_fh.write(in_fh.next())

    # Align the extracted fastqs to the contig fasta.
    contig_fasta = contig.dataset_set.get(
            type=Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()
    contig_reads_to_contig_bam = os.path.join(
            contig.get_model_data_dir(), 'reads_to_contig.bam')
    simple_align_paired_with_bwa_mem(
            output_fq_list, contig_fasta, contig_reads_to_contig_bam)

    # Coordinate sort and index bam for jbrowse.
    coordinate_sorted_bam = (
            os.path.splitext(contig_reads_to_contig_bam)[0] +
            '.coordinate_sorted.bam')
    sort_bam_by_coordinate(contig_reads_to_contig_bam, coordinate_sorted_bam)
    index_bam(coordinate_sorted_bam)

    # Add the bam file to contig as BWA_ALIGN dataset, overwriting it
    # if it already exists.
    dataset_query = contig.dataset_set.filter(type=Dataset.TYPE.BWA_ALIGN)
    if dataset_query.count():
        dataset_query[0].delete()
    add_dataset_to_entity(
            contig,
            Dataset.TYPE.BWA_ALIGN,
            Dataset.TYPE.BWA_ALIGN,
            filesystem_location=coordinate_sorted_bam)
def bootstrap_fake_data():
    """Fill the database with fake data.

    Creates projects, reference genomes, saved queries, samples (with and
    without backing reads), an alignment, fake variants, and a fully
    aligned "full VCF" test set with regions, then bootstraps the SV
    project.
    """
    user = get_or_create_user()

    ### Create some projects
    (test_project, project_created) = Project.objects.get_or_create(
            title=TEST_PROJECT_NAME, owner=user.get_profile())
    (test_project_2, project_created) = Project.objects.get_or_create(
            title=SV_PROJECT_NAME, owner=user.get_profile())

    ### Create some reference genomes
    ref_genome_1 = import_reference_genome_from_local_file(
            test_project, REF_GENOME_1_LABEL, TEST_FASTA, 'fasta')
    ref_genome_2 = import_reference_genome_from_local_file(
            test_project, REF_GENOME_2_LABEL, TEST_FASTA, 'fasta')
    ref_genome_3 = import_reference_genome_from_local_file(
            test_project, 'test_genome', TEST_FASTA, 'fasta')

    ### Create some saved queries.
    for saved_query_text in CUSTOM_SAVED_QUERY_LIST:
        SavedVariantFilterQuery.objects.get_or_create(
                owner=user.get_profile(), text=saved_query_text)

    ### Create some ExperimentSamples.

    # Create some samples without backing data just to explore the UI.
    ExperimentSample.objects.create(
            project=test_project, label='C321D_MiSeq',
            data={'SAMPLE_WELL': 'A01'})
    ExperimentSample.objects.create(
            project=test_project, label='C321D Fixed 01',
            data={'SAMPLE_WELL': 'A02'})
    ExperimentSample.objects.create(
            project=test_project, label='C321D Fixed 02',
            data={'SAMPLE_WELL': 'A03'})

    # Create some samples with backing data.
    (sample_1, created) = ExperimentSample.objects.get_or_create(
            project=test_project, label=SAMPLE_1_LABEL)
    # Add datasets to the samples (idempotent across reruns).
    if not sample_1.dataset_set.filter(type=Dataset.TYPE.FASTQ1):
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1, TEST_FASTQ1)
    if not sample_1.dataset_set.filter(type=Dataset.TYPE.FASTQ2):
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2, TEST_FASTQ2)

    # Create sample backed by g-zipped data.
    gz_backed_sample = ExperimentSample.objects.create(
            project=test_project, label='sample backed by gz data')
    gz_fastq1_dataset = copy_and_add_dataset_source(
            gz_backed_sample, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
            TEST_FASTQ_GZ_1)
    # BUGFIX: the first type argument was Dataset.TYPE.FASTQ1; this is the
    # reverse-read dataset so both type arguments must be FASTQ2, matching
    # every other copy_and_add_dataset_source call site.
    gz_fastq2_dataset = copy_and_add_dataset_source(
            gz_backed_sample, Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2,
            TEST_FASTQ_GZ_2)
    run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq1_dataset)
    run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq2_dataset, rev=True)

    ### Create an alignment.
    alignment_group_1 = AlignmentGroup.objects.create(
            label='Alignment 1',
            reference_genome=ref_genome_3,
            aligner=AlignmentGroup.ALIGNER.BWA)
    # Link it to a sample.
    sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=alignment_group_1,
            experiment_sample=sample_1)
    ### Add alignment data. NOTE: Stored in sample model dir.
    # NOTE: This is a bit convoluted. Perhaps it would be better to store
    # alignments in the ExperimentSampleToAlignment directory.
    copy_dest = copy_dataset_to_entity_data_dir(sample_1, TEST_BAM)
    copy_dataset_to_entity_data_dir(sample_1, TEST_BAM_INDEX)
    add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
            Dataset.TYPE.BWA_ALIGN, copy_dest)

    # Create fake variants.
    create_fake_variants_and_variant_sets(ref_genome_1)

    #############################
    # Full VCF Testing (annotated for snpeff, variant filtering, etc)
    #############################

    # Create a new reference genome and samples using full_vcf_test_set
    full_vcf_reference_genome = import_reference_genome_from_local_file(
            test_project, 'mg1655_tolC_through_zupT',
            FullVCFTestSet.TEST_GENBANK, 'genbank')

    # Create all samples.
    parent_obj = None
    full_vcf_samples = []
    for i in range(FullVCFTestSet.NUM_SAMPLES):
        sample_obj = ExperimentSample.objects.create(
                project=test_project,
                label='Sample %d' % i)

        sample_obj.data['SAMPLE_WELL'] = 'A0%d' % (i + 1)

        # Sample 0 is the parent of all subsequent samples.
        if i == 0:
            parent_obj = sample_obj
        if i > 0:
            sample_obj.data['SAMPLE_PARENTS'] = parent_obj.label
            parent_obj.add_child(sample_obj)
            parent_obj.save()

        sample_obj.save()

        # Add raw reads to each sample.
        fastq1_dataset = copy_and_add_dataset_source(
                sample_obj, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
                FullVCFTestSet.FASTQ1[i])
        fastq2_dataset = copy_and_add_dataset_source(
                sample_obj, Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2,
                FullVCFTestSet.FASTQ2[i])

        # Run FASTQC on sample reads.
        run_fastqc_on_sample_fastq(sample_obj, fastq1_dataset)
        run_fastqc_on_sample_fastq(sample_obj, fastq2_dataset, rev=True)

        full_vcf_samples.append(sample_obj)

    # Run the alignment. BUGFIX: the previous 2-tuple unpack raised
    # ValueError when run_pipeline returns 3 values, as other call sites
    # in this codebase expect (alignment_group, alignment_async_result,
    # variant_calling_async_result). Index instead of unpacking so this
    # works with either arity.
    pipeline_result = run_pipeline(
            'test_align', full_vcf_reference_genome, full_vcf_samples)
    full_vcf_alignment_group = pipeline_result[0]

    import_variant_set_from_vcf(
            full_vcf_reference_genome, 'Designed',
            FullVCFTestSet.TEST_DESIGNED_SNPS)

    def _create_region_intervals(region, interval_tuple_list):
        """Helper method to create RegionIntervals for a Region.

        Args:
            region: Region Model object.
            interval_tuple_list: List of tuples of intervals to create.
        """
        for interval in interval_tuple_list:
            RegionInterval.objects.create(
                    region=region,
                    start=interval[0],
                    end=interval[1])

    # Create some fake regions.
    # TODO: Should not be much harder to replace this with real regions.
    region_1 = Region.objects.create(
            reference_genome=full_vcf_reference_genome,
            label='region_1',
            type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_1, [(1, 150), (300, 400), (500, 900)])

    region_2 = Region.objects.create(
            reference_genome=full_vcf_reference_genome,
            label='region_2',
            type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_2, [(1000, 1500)])

    region_3 = Region.objects.create(
            reference_genome=full_vcf_reference_genome,
            label='region_3',
            type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_3, [(1800, 1900), (2150, 2300)])

    # And some GENE regions.
    gene_A = Region.objects.create(
            reference_genome=full_vcf_reference_genome,
            label='geneA',
            type=Region.TYPE.GENE)
    _create_region_intervals(gene_A, [(2000, 2400)])

    gene_B = Region.objects.create(
            reference_genome=full_vcf_reference_genome,
            label='geneB',
            type=Region.TYPE.GENE)
    _create_region_intervals(gene_B, [(4800, 5200)])

    gene_C = Region.objects.create(
            reference_genome=full_vcf_reference_genome,
            label='geneC',
            type=Region.TYPE.GENE)
    _create_region_intervals(gene_C, [(1, 500)])

    # Bootstrap test_project_2 with SV stuff
    sv_testing_bootstrap(test_project_2)
def evaluate_contigs(contig_uid_list, skip_extracted_read_alignment=False,
        use_read_alignment=True):
    """Attempt to place assembled contigs against the reference and
    export the results as vcf Datasets on the sample alignment.

    Placeable contigs are flagged via metadata['is_placeable'] and
    written to a contigs vcf; translocation and mobile-element
    translocation variants go to their own vcfs/datasets.
    """
    if not contig_uid_list:
        return

    def _length_weighted_coverage(contig):
        return contig.num_bases * contig.coverage

    # Request contig_list from db, ordered by highest length-weighted
    # coverage.
    contig_list = list(Contig.objects.filter(uid__in=contig_uid_list))
    contig_list.sort(key=_length_weighted_coverage, reverse=True)

    # All contigs have the same sample_alignment, so grab it from the
    # first one.
    contig = contig_list[0]
    sample_alignment = contig.experiment_sample_to_alignment
    ref_genome = sample_alignment.alignment_group.reference_genome

    # Attempt placing contigs. Get back placeable contigs, translocation
    # variants (dict objs), and mobile element translocation variants
    # (dict objs).
    (placeable_contig_uid_list, var_dict_list,
            me_var_dict_list) = graph_contig_placement(
                    contig_uid_list, skip_extracted_read_alignment,
                    use_read_alignment)

    # Update contig list with new features from graph_contig_placement.
    contig_list = list(Contig.objects.filter(uid__in=contig_uid_list))
    contig_list.sort(key=_length_weighted_coverage, reverse=True)

    # Annotate contigs with the gene names that they fall within 50 bp of.
    annotate_contig_junctions(contig_uid_list, ref_genome, dist=50)

    # Handle placeable contigs, if any.
    if len(placeable_contig_uid_list):
        placeable_contigs = Contig.objects.filter(
                uid__in=placeable_contig_uid_list)
        for contig in placeable_contigs:
            contig.metadata['is_placeable'] = True
            contig.save()

        placeable_contig_vcf_path = os.path.join(
                sample_alignment.get_model_data_dir(),
                'de_novo_assembled_contigs.vcf')

        # Write contigs to vcf.
        export_contig_list_as_vcf(
                placeable_contigs, placeable_contig_vcf_path)

        # Make dataset for contigs vcf.
        add_dataset_to_entity(
                sample_alignment,
                Dataset.TYPE.VCF_DE_NOVO_ASSEMBLED_CONTIGS,
                Dataset.TYPE.VCF_DE_NOVO_ASSEMBLED_CONTIGS,
                placeable_contig_vcf_path)

    # Handle other types of contig objects, if any.
    var_dict_vcf_path = os.path.join(
            sample_alignment.get_model_data_dir(),
            'de_novo_assembly_translocations.vcf')
    me_var_dict_vcf_path = os.path.join(
            sample_alignment.get_model_data_dir(),
            'de_novo_assembly_me_translocations.vcf')

    for var_dl, method, path, dataset_type in [
            (var_dict_list, 'GRAPH_WALK', var_dict_vcf_path,
                    Dataset.TYPE.VCF_DE_NOVO_ASSEMBLY_GRAPH_WALK),
            (me_var_dict_list, 'ME_GRAPH_WALK', me_var_dict_vcf_path,
                    Dataset.TYPE.VCF_DE_NOVO_ASSEMBLY_ME_GRAPH_WALK)]:
        if not var_dl:
            continue

        # Write variant dicts to vcf.
        export_var_dict_list_as_vcf(var_dl, path, sample_alignment, method)

        # Make dataset for the vcf.
        add_dataset_to_entity(
                sample_alignment, dataset_type, dataset_type, path)
def assemble_with_velvet(assembly_dir, velvet_opts, sv_indicants_bam,
        sample_alignment, overwrite=True, reassemble_contig_from_reads=False):
    """Run velvet on SV-indicating reads and create a Contig per node.

    Parses velvet's contigs.fa output, strips ambiguous 'N' bases, renames
    each record NODE_<i>, and creates a Contig model (with coverage /
    timestamp / node-number metadata and a fasta Dataset) for each one.
    Velvet working data is deleted afterwards.

    Args:
        assembly_dir: Directory in which velvet is run.
        velvet_opts: Options passed through to _run_velvet().
        sv_indicants_bam: Bam of SV-indicating reads to assemble.
        sample_alignment: ExperimentSampleToAlignment owning the contigs.
        overwrite: Only consumed by the disabled reassembly path below.
        reassemble_contig_from_reads: Must be False; see NOTE below.

    Returns:
        List of uids of the created Contig objects.
    """
    # NOTE: Unused. If enabled, will call make_contig_reads_to_ref_alignments()
    # which is not used anywhere currently due to performance issues which
    # which are particularly bad when many unused reads.
    assert not reassemble_contig_from_reads

    timestamp = str(datetime.datetime.now())
    # Matches e.g. 'NODE_12_...' velvet descriptors; group(1) is node number.
    contig_number_pattern = re.compile('^NODE_(\d+)_')

    reference_genome = sample_alignment.alignment_group.reference_genome

    contig_files = []
    contig_uid_list = []

    _run_velvet(assembly_dir, velvet_opts, sv_indicants_bam)

    # Collect resulting contigs fasta
    contigs_fasta = os.path.join(assembly_dir, 'contigs.fa')
    contig_files.append(contigs_fasta)

    records = list(SeqIO.parse(contigs_fasta, 'fasta'))
    # Width used for zero-padding contig labels so they sort lexically.
    digits = len(str(len(records))) + 1
    for (i, seq_record) in enumerate(records, 1):
        # Extract contig sequence from the contigs.fa file, number, and
        # name it.
        contig_node_number = int(
                contig_number_pattern.findall(
                        seq_record.description)[0])
        # Velvet encodes coverage as the trailing '_'-separated field.
        coverage = float(seq_record.description.rsplit('_', 1)[1])
        # Remove ambiguous 'N' bases by splitting on N and concatenating.
        seq_record.seq = reduce(
                lambda x, y: x + y,
                [seq for seq in seq_record.seq.split('N')])
        seq_record.id = seq_record.name = seq_record.description = (
                'NODE_' + str(i))
        leading_zeros = digits - len(str(i))
        contig_label = '%s_%s' % (
                sample_alignment.experiment_sample.label,
                leading_zeros * '0' + str(i))

        # Create model and metadata.
        contig = Contig.objects.create(
                label=contig_label,
                parent_reference_genome=reference_genome,
                experiment_sample_to_alignment=(
                        sample_alignment))
        contig.metadata['coverage'] = coverage
        contig.metadata['timestamp'] = timestamp
        contig.metadata['node_number'] = contig_node_number
        contig.metadata['assembly_dir'] = assembly_dir

        contig.ensure_model_data_dir_exists()

        # NOTE: Unused code.
        # Reassemble the contig from its constituent reads separately,
        # using a second velvet call.
        # if reassemble_contig_from_reads:
        #     # 1. Grab reads from velvet to reassemble the contig
        #     make_contig_reads_to_ref_alignments(contig,
        #             add_jbrowse_track=False, overwrite=overwrite)
        #     contig_reads_bam = os.path.join(
        #             contig.get_model_data_dir(),
        #             'sv_indicants.bam')
        #     # 2. Reassemble the contig from its whole reads using velvet -
        #     # this generates longer contigs because the graph will trim the
        #     # edges if there is a branchpoint. With only one node it should
        #     # be very fast.
        #     _run_velvet(contig.get_model_data_dir(), velvet_opts,
        #             contig_reads_bam)
        #     reassembled_seqrecord = _extract_node_from_contig_reassembly(
        #             contig)
        #     if reassembled_seqrecord:
        #         seq_record.seq = reassembled_seqrecord.seq

        # Write the contig fasta and add it as a dataset to the contig object.
        dataset_path = os.path.join(contig.get_model_data_dir(), 'fasta.fa')
        with open(dataset_path, 'w') as fh:
            SeqIO.write([seq_record], fh, 'fasta')
        add_dataset_to_entity(
                contig,
                'contig_fasta',
                Dataset.TYPE.REFERENCE_GENOME_FASTA,
                filesystem_location=dataset_path)

        contig.save()

        # NOTE: Disabled for now. Severe performance issues.
        # Make a bam track on the reference for each contig that shows only the
        # reads that assembled the contig and their mates
        # make_contig_reads_to_ref_alignments(contig.uid)

        # append the uid to the contig_uid_list
        contig_uid_list.append(contig.uid)

    # once contigs are extracted, remove velvet data
    _cleanup_velvet_dir(assembly_dir)

    return contig_uid_list
def _run_contig_walk_test(self, test_dir):
    """Run the contig-walk pipeline against fixture data in test_dir.

    Expects test_dir to contain ref.fa, target.fa, and consecutively
    numbered contig_<i>.fa files starting at i=0. Places the contigs,
    creates de novo variants, applies them to a new reference genome,
    and asserts the result matches target.fa.

    Args:
        test_dir: Directory holding the fixture fasta files.
    """
    ref_fasta = os.path.join(test_dir, 'ref.fa')
    target_fasta = os.path.join(test_dir, 'target.fa')

    # Gather contig_0.fa, contig_1.fa, ... until the numbering breaks.
    contig_fasta_list = []
    i = 0
    contig_fasta_path = os.path.join(test_dir, 'contig_' + str(i) + '.fa')
    while os.path.exists(contig_fasta_path):
        contig_fasta_list.append(contig_fasta_path)
        i += 1
        contig_fasta_path = os.path.join(
                test_dir, 'contig_' + str(i) + '.fa')

    dummy_models = self._make_dummy_models()
    reference_genome = dummy_models['reference_genome']
    sample_alignment = dummy_models['sample_alignment']
    alignment_group = dummy_models['alignment_group']

    add_dataset_to_entity(
            reference_genome,
            Dataset.TYPE.REFERENCE_GENOME_FASTA,
            Dataset.TYPE.REFERENCE_GENOME_FASTA,
            filesystem_location=ref_fasta)

    # Make data_dir directory to house genome_finishing files
    assembly_dir = os.path.join(
            sample_alignment.get_model_data_dir(), 'assembly')

    # Make assembly directory
    os.mkdir(assembly_dir)

    data_dir = os.path.join(assembly_dir, '0')
    os.mkdir(data_dir)

    # Create contigs
    contig_uid_list = []
    for i, contig_fasta in enumerate(contig_fasta_list):
        contig = Contig.objects.create(
                parent_reference_genome=reference_genome,
                experiment_sample_to_alignment=sample_alignment,
                label='test_contig_' + str(i))
        add_dataset_to_entity(
                contig,
                Dataset.TYPE.REFERENCE_GENOME_FASTA,
                Dataset.TYPE.REFERENCE_GENOME_FASTA,
                filesystem_location=contig_fasta)
        contig.metadata['assembly_dir'] = data_dir
        contig.metadata['node_number'] = i
        # BUGFIX: persist metadata before downstream code re-fetches the
        # Contig from the db (previously the metadata updates were lost),
        # and collect uids since evaluate_contigs() filters on uid__in
        # (previously whole Contig objects were passed in).
        contig.save()
        contig_uid_list.append(contig.uid)

    # Place contigs and create variants
    evaluate_contigs(contig_uid_list,
            skip_extracted_read_alignment=True,
            use_read_alignment=False)
    parse_variants_from_vcf(sample_alignment)

    # Get set of de novo variants
    variant_set = create_de_novo_variants_set(
            alignment_group, 'de_novo_variants')

    self.assertTrue(variant_set.variants.exists())
    self.assertEqual(len(variant_set.variants.all()), 1)

    # Make new reference genome
    new_ref_genome_params = {'label': 'new_ref'}
    new_ref_genome = generate_new_reference_genome(
            variant_set, new_ref_genome_params)

    # Verify insertion was placed correctly
    new_ref_genome_fasta = get_dataset_with_type(
            new_ref_genome,
            Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()
    fastas_same, indexes = are_fastas_same(
            target_fasta, new_ref_genome_fasta)
    self.assertTrue(fastas_same)
def evaluate_contigs(contig_uid_list, skip_extracted_read_alignment=False,
        use_read_alignment=True):
    """Place assembled contigs on the reference and export variant VCFs.

    Runs graph-based contig placement, annotates junction genes, marks
    placeable contigs, and writes up to three VCF Datasets onto the
    contigs' shared sample alignment: placeable contigs, translocations,
    and mobile-element translocations.

    Args:
        contig_uid_list: List of Contig uids to evaluate. No-op if empty.
        skip_extracted_read_alignment: Passed to graph_contig_placement().
        use_read_alignment: Passed to graph_contig_placement(); when True,
            placements are filtered using read coverage.
    """
    if not contig_uid_list:
        return

    def _length_weighted_coverage(contig):
        # Sort key favoring long, high-coverage contigs.
        return contig.num_bases * contig.coverage

    # Request contig_list from db and order by highest length weighted coverage
    contig_list = list(Contig.objects.filter(uid__in=contig_uid_list))
    contig_list.sort(key=_length_weighted_coverage, reverse=True)

    # All contigs have have same sample_alignment so grab sample alignment from
    # the first one.
    contig = contig_list[0]
    sample_alignment = contig.experiment_sample_to_alignment
    ref_genome = sample_alignment.alignment_group.reference_genome

    # Attempt placing contigs. Get back placeable contigs,
    # translocation variants (dict obj), and mobile elements translocation
    # variants (dict obj).
    placeable_contig_uid_list, var_dict_list, me_var_dict_list = graph_contig_placement(
            contig_uid_list, skip_extracted_read_alignment, use_read_alignment)

    # update contig list with new features from graph_contig_placement
    contig_list = list(Contig.objects.filter(uid__in=contig_uid_list))
    contig_list.sort(key=_length_weighted_coverage, reverse=True)

    # Annotate contig with the gene names that they fall within 50 bp of.
    annotate_contig_junctions(contig_uid_list, ref_genome, dist=50)

    # Handle placeable contigs, if any.
    if len(placeable_contig_uid_list):
        placeable_contigs = Contig.objects.filter(
                uid__in=placeable_contig_uid_list)
        for contig in placeable_contigs:
            contig.metadata['is_placeable'] = True
            contig.save()

        placeable_contig_vcf_path = os.path.join(
                sample_alignment.get_model_data_dir(),
                'de_novo_assembled_contigs.vcf')

        # Write contigs to vcf
        export_contig_list_as_vcf(placeable_contigs, placeable_contig_vcf_path)

        # Make dataset for contigs vcf
        add_dataset_to_entity(sample_alignment,
                Dataset.TYPE.VCF_DE_NOVO_ASSEMBLED_CONTIGS,
                Dataset.TYPE.VCF_DE_NOVO_ASSEMBLED_CONTIGS,
                placeable_contig_vcf_path)

    # Handle other types of contig objects, if any.
    var_dict_vcf_path = os.path.join(sample_alignment.get_model_data_dir(),
            'de_novo_assembly_translocations.vcf')
    me_var_dict_vcf_path = os.path.join(
            sample_alignment.get_model_data_dir(),
            'de_novo_assembly_me_translocations.vcf')
    for var_dl, method, path, dataset_type in [
            (var_dict_list, 'GRAPH_WALK', var_dict_vcf_path,
                    Dataset.TYPE.VCF_DE_NOVO_ASSEMBLY_GRAPH_WALK),
            (me_var_dict_list, 'ME_GRAPH_WALK', me_var_dict_vcf_path,
                    Dataset.TYPE.VCF_DE_NOVO_ASSEMBLY_ME_GRAPH_WALK)]:
        if not var_dl:
            continue

        # Write variant dicts to vcf
        export_var_dict_list_as_vcf(var_dl, path, sample_alignment, method)

        # Make dataset for contigs vcf
        add_dataset_to_entity(sample_alignment, dataset_type, dataset_type,
                path)
def _run_contig_walk_test(self, test_dir):
    """Run contig placement on fixture contigs and return the variant set.

    Expects test_dir to contain ref.fa, optionally ref.gb, and contig_<i>.fa
    files. Creates Contig models, runs evaluate_contigs(), parses the
    resulting VCFs, and returns the de novo variant set. Also stores
    self.target_fasta and self.contig_uid_list for the caller to inspect.

    Args:
        test_dir: Directory holding the fixture files.

    Returns:
        The VariantSet of de novo variants.
    """
    ref_fasta = os.path.join(test_dir, 'ref.fa')
    self.target_fasta = os.path.join(test_dir, 'target.fa')

    # Find all files named contig_<number>.fa in the test dir.
    contig_fasta_list = filter(
            lambda x: re.match(r'contig_\d+\.fa', x),
            os.listdir(test_dir))
    contig_fasta_list = [os.path.join(test_dir, filename)
            for filename in contig_fasta_list]

    dummy_models = self._make_dummy_models()
    reference_genome = dummy_models['reference_genome']
    sample_alignment = dummy_models['sample_alignment']
    alignment_group = dummy_models['alignment_group']

    add_dataset_to_entity(
            reference_genome,
            Dataset.TYPE.REFERENCE_GENOME_FASTA,
            Dataset.TYPE.REFERENCE_GENOME_FASTA,
            filesystem_location=ref_fasta)

    # Optional annotated reference enables the mobile-element code paths.
    ref_genbank = os.path.join(test_dir, 'ref.gb')
    if os.path.exists(ref_genbank):
        add_dataset_to_entity(
                reference_genome,
                Dataset.TYPE.REFERENCE_GENOME_GENBANK,
                Dataset.TYPE.REFERENCE_GENOME_GENBANK,
                ref_genbank)
        reference_genome.ensure_mobile_element_multifasta()

    # Make data_dir directory to house genome_finishing files
    assembly_dir = os.path.join(
            sample_alignment.get_model_data_dir(), 'assembly')

    # Make assembly directory
    os.mkdir(assembly_dir)

    data_dir = os.path.join(assembly_dir, '0')
    os.mkdir(data_dir)

    # Create contigs
    contig_uid_list = []
    for i, contig_fasta in enumerate(contig_fasta_list):
        contig = Contig.objects.create(
                parent_reference_genome=reference_genome,
                experiment_sample_to_alignment=sample_alignment,
                label='test_contig_' + str(i))
        add_dataset_to_entity(
                contig,
                Dataset.TYPE.REFERENCE_GENOME_FASTA,
                Dataset.TYPE.REFERENCE_GENOME_FASTA,
                filesystem_location=contig_fasta)
        contig.metadata['assembly_dir'] = data_dir
        contig.metadata['node_number'] = i
        contig_uid_list.append(contig.uid)
        # Persist metadata so downstream db re-fetches see it.
        contig.save()

    # Place contigs and create variants
    evaluate_contigs(contig_uid_list,
            skip_extracted_read_alignment=True,
            use_read_alignment=False)
    parse_variants_from_vcf(sample_alignment)

    self.contig_uid_list = contig_uid_list

    # Get set of de novo variants
    variant_set = create_de_novo_variants_set(
            alignment_group, 'de_novo_variants')

    # Debug output: each variant is expected to have exactly one alternate.
    for v in variant_set.variants.all():
        alts = v.get_alternates()
        assert len(alts) == 1
        alt = alts[0]
        print '\npos:%s\nref: %dbp :%s\nalt: %dbp :%s\n' % (
                v.position, len(v.ref_value), v.ref_value,
                len(alt), alt)

    return variant_set
def align_contig_reads_to_contig(contig):
    """Align the reads that assembled a contig back onto the contig itself.

    Reads qnames from the contig's extracted_reads.fa, pulls the matching
    records out of the sample's source fastq(s) (plain or gzipped), aligns
    them to the contig fasta with bwa mem, coordinate-sorts and indexes the
    bam, and attaches it to the contig as its BWA_ALIGN Dataset (replacing
    any existing one).

    Args:
        contig: Contig model whose model data dir holds extracted_reads.fa.
    """
    # Get fasta of reads used to make contig
    contig_reads_fasta = os.path.join(contig.get_model_data_dir(),
            'extracted_reads.fa')

    # Pull out contig read qnames and put in dictionary contig_reads
    # Fasta headers look like '>qname/1' or '>qname/2' (read pair number).
    p1 = re.compile('>(\S+)/(\d)')
    contig_reads = defaultdict(list)
    with open(contig_reads_fasta) as fh:
        for line in fh:
            m1 = p1.match(line)
            if m1:
                read_id = m1.group(1)
                read_number = int(m1.group(2))
                contig_reads[read_id].append(read_number)

    # Get source reads fastqs
    sample = contig.experiment_sample_to_alignment.experiment_sample
    source_fq1 = sample.dataset_set.get(
            type=Dataset.TYPE.FASTQ1).get_absolute_location()
    source_fq2_query = sample.dataset_set.filter(type=Dataset.TYPE.FASTQ2)
    is_paired_end = source_fq2_query.exists()
    if is_paired_end:
        source_fq2 = source_fq2_query[0].get_absolute_location()

    # Make filenames for contig read fastqs
    output_fq1 = os.path.join(contig.get_model_data_dir(), 'reads.1.fq')
    if is_paired_end:
        output_fq2 = os.path.join(contig.get_model_data_dir(), 'reads.2.fq')

    # Go through source fastqs and write reads in contig_reads to file
    source_fq_list = [source_fq1]
    output_fq_list = [output_fq1]
    if is_paired_end:
        source_fq_list.append(source_fq2)
        output_fq_list.append(output_fq2)

    # Fastq record headers start with '@qname'.
    p1 = re.compile('@(\S+)')
    for input_fq_path, output_fq_path in zip(source_fq_list, output_fq_list):
        if input_fq_path.endswith('.fq'):
            file_like = open(input_fq_path)
        elif input_fq_path.endswith('.gz'):
            file_like = gzip.open(input_fq_path)
        else:
            raise Exception('Compression type not supported')
        with file_like as in_fh, \
                open(output_fq_path, 'w') as out_fh:
            for line in in_fh:
                m1 = p1.match(line)
                if m1:
                    qname = m1.group(1)
                    if qname in contig_reads:
                        # Copy the full 4-line fastq record: header plus
                        # sequence, '+', and quality lines.
                        out_fh.write(line)
                        out_fh.write(in_fh.next())
                        out_fh.write(in_fh.next())
                        out_fh.write(in_fh.next())

    # Align fastqs to contig fasta
    contig_fasta = contig.dataset_set.get(
            type=Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()
    contig_reads_to_contig_bam = os.path.join(contig.get_model_data_dir(),
            'reads_to_contig.bam')
    simple_align_paired_with_bwa_mem(
            output_fq_list, contig_fasta, contig_reads_to_contig_bam)

    # Coordinate sort and index bam for jbrowse
    coordinate_sorted_bam = (os.path.splitext(contig_reads_to_contig_bam)[0] +
            '.coordinate_sorted.bam')
    sort_bam_by_coordinate(contig_reads_to_contig_bam, coordinate_sorted_bam)
    index_bam(coordinate_sorted_bam)

    # Add the bam file to contig as BWA_ALIGN dataset, overwriting it
    # if it already exists
    dataset_query = contig.dataset_set.filter(type=Dataset.TYPE.BWA_ALIGN)
    if dataset_query.count():
        dataset_query[0].delete()
    add_dataset_to_entity(contig,
            Dataset.TYPE.BWA_ALIGN,
            Dataset.TYPE.BWA_ALIGN,
            filesystem_location=coordinate_sorted_bam)
def graph_contig_placement(contig_uid_list, skip_extracted_read_alignment,
        use_alignment_reads=True):
    """Align contigs passed in contig_list to the reference and to any
    annotated mobile elements in the reference genbank and use the alignment
    to build a sequence graph.

    The sequence graph is then used by graph walking algorithms that call
    structural variants from paths in the graph.

    Args:
        contig_uid_list: list of Contig uids.
        skip_extracted_read_alignment: if False, extract the reads that
            assembled each contig and make them a bam track on the reference
        use_alignment_reads: if True, filter contig placements that would
            delete regions of moderate coverage

    Returns:
        placeable_contig_uid_list: uids of Contigs with metadata fields
            holding their reference placement parameters
        var_dict_list: list of dictionary representations of translocation
            variants with keys: chromosome, pos, ref_seq, alt_seq
        me_var_dict_list: list of dictionary representations of mobile
            element translocation variants with keys: chromosome, pos,
            ref_seq, alt_seq, MEINFO
    """
    def _length_weighted_coverage(contig):
        # Sort key favoring long, high-coverage contigs.
        return contig.num_bases * contig.coverage

    # Request contig_list from db and order by highest length weighted coverage
    contig_list = list(Contig.objects.filter(uid__in=contig_uid_list))
    contig_list.sort(key=_length_weighted_coverage, reverse=True)

    sample_alignment = contig_list[0].experiment_sample_to_alignment
    sample_alignment.data['assembly_status'] = (
            ExperimentSampleToAlignment.ASSEMBLY_STATUS.BUILDING_SEQUENCE_GRAPH)
    sample_alignment.save()

    ref_genome = sample_alignment.alignment_group.reference_genome

    # Make Assembly dir; wipe any previous contig alignment output.
    assembly_dir = os.path.join(sample_alignment.get_model_data_dir(),
            'assembly')
    contig_alignment_dir = os.path.join(assembly_dir, 'contig_alignment')
    if os.path.exists(contig_alignment_dir):
        shutil.rmtree(contig_alignment_dir)
    os.mkdir(contig_alignment_dir)

    # NOTE(gleb): Not sure whether these have to be ordered, but keeping
    # them ordered while refactoring.
    contigs_as_ordered_dict = OrderedDict([(c.uid, c) for c in contig_list])

    # Concatenate contig fastas for alignment
    contig_concat = os.path.join(contig_alignment_dir, 'contig_concat.fa')
    with open(contig_concat, 'w') as output_fh:
        for contig_uid, c in contigs_as_ordered_dict.iteritems():
            contig_fasta_file = get_fasta(c)
            with open(contig_fasta_file) as read_fh:
                output_fh.write(read_fh.read())

    # Create dictionaries to translate contig uid to its fasta descriptor line
    contig_qname_to_uid = {}
    for contig_uid, c in contigs_as_ordered_dict.items():
        contig_fasta_file = get_fasta(c)
        with open(contig_fasta_file, 'r') as fh:
            # First line of the fasta is the '>descriptor' header.
            descriptor = fh.next()
        contig_qname_to_uid[descriptor.strip('>\n')] = contig_uid

    # Get extracted mobile elements in addition to contigs
    if ref_genome.is_annotated():
        me_fa_dataset = get_dataset_with_type(
                ref_genome, Dataset.TYPE.MOBILE_ELEMENT_FASTA)
        me_concat_fasta = me_fa_dataset.get_absolute_location()
        contig_alignment_to_me_bam = os.path.join(
                contig_alignment_dir, 'contig_alignment_to_me.bam')
        if not os.path.exists(contig_alignment_to_me_bam):
            ensure_bwa_index(me_concat_fasta)
            simple_align_with_bwa_mem(contig_concat, me_concat_fasta,
                    contig_alignment_to_me_bam, ['-T', '15'])

    # Align concatenated contig fastas to reference
    contig_alignment_bam = os.path.join(contig_alignment_dir,
            'contig_alignment.bam')
    print 'Aligning contigs to reference'
    simple_align_with_bwa_mem(contig_concat, get_fasta(ref_genome),
            contig_alignment_bam, ['-T', '15'])

    # Create graph
    G = nx.DiGraph()

    # Create sequence interval instances for reference and each contig
    ref_intervals = SequenceIntervals(ref_genome.uid, ref_genome.num_bases,
            tag='ref')
    G.ref_intervals = ref_intervals

    add_alignment_to_graph(G, contig_alignment_bam)

    if ref_genome.is_annotated():
        add_me_alignment_to_graph(G, contig_alignment_to_me_bam)

    # Add SEQUENCE_GRAPH_PICKLE dataset to sample alignment
    graph_pickle_path = os.path.join(contig_alignment_dir,
            'sequence_graph.pickle')
    nx.write_gpickle(G, graph_pickle_path)
    add_dataset_to_entity(sample_alignment,
            Dataset.TYPE.SEQUENCE_GRAPH_PICKLE,
            Dataset.TYPE.SEQUENCE_GRAPH_PICKLE,
            graph_pickle_path)

    detect_strand_chromosome_junctions(contig_qname_to_uid,
            contig_alignment_bam)

    placeable_contig_uid_list = []
    iv_list = novel_seq_ins_walk(G)

    if use_alignment_reads:
        coverage_stats = get_coverage_stats(sample_alignment)
        sample_alignment_bam = sample_alignment.dataset_set.get(
                type=Dataset.TYPE.BWA_ALIGN).get_absolute_location()

    for insertion_vertices in iv_list:
        contig_qname = insertion_vertices.enter_contig.seq_uid
        contig_uid = contig_qname_to_uid[contig_qname]
        contig = Contig.objects.get(uid=contig_uid)
        set_contig_placement_params(contig, insertion_vertices)

        if use_alignment_reads:
            # Filter out deletions of good coverage regions
            deletion_length = (insertion_vertices.enter_ref.pos -
                    insertion_vertices.exit_ref.pos)
            if deletion_length > 0:
                deletion_cov = avg_coverage(
                        sample_alignment_bam,
                        contig.metadata['chromosome'],
                        insertion_vertices.exit_ref.pos,
                        insertion_vertices.enter_ref.pos)
                chrom_cov_stats = coverage_stats[contig.metadata['chromosome']]
                chrom_cov_mean = chrom_cov_stats['mean']
                chrom_cov_std = chrom_cov_stats['std']
            # NOTE: relies on short-circuiting — deletion_cov and the
            # chrom_cov_* locals are only bound when deletion_length > 0.
            if deletion_length <= 0 or (deletion_cov <
                    chrom_cov_mean - chrom_cov_std):
                placeable_contig_uid_list.append(contig.uid)
        else:
            placeable_contig_uid_list.append(contig.uid)

    # Perform translocation walk
    if ref_genome.num_chromosomes == 1:
        trans_iv_pairs = translocation_walk(G)
        var_dict_list = [
                parse_path_into_ref_alt(iv_pair, contig_qname_to_uid,
                        sample_alignment)
                for iv_pair in trans_iv_pairs]
        # Drop variants where both ref and alt sequences are empty.
        var_dict_list = [var_d for var_d in var_dict_list
                if any([var_d['ref_seq'], var_d['alt_seq']])]

        if ref_genome.is_annotated():
            me_trans_iv_pairs = me_translocation_walk(G)
            me_var_dict_list = [
                    parse_path_into_ref_alt(iv_pair, contig_qname_to_uid,
                            sample_alignment)
                    for iv_pair in me_trans_iv_pairs]
        else:
            me_var_dict_list = []
    else:
        print 'Translocation walk not implemented for multi-chromosomal refs'
        var_dict_list = []
        me_var_dict_list = []

    return placeable_contig_uid_list, var_dict_list, me_var_dict_list
def test_add_variants_to_set_from_bed(self):
    """Verify variants are binned into sets by the callable-loci BED.

    Creates variants at random positions inside the BED regions
    (POOR_MAPPING_QUALITY over [101, 200); NO_COVERAGE over [301, 400)
    and [501, 600)), attaches the test BED as a dataset, then checks that
    add_variants_to_set_from_bed() groups each variant into the variant
    set named after its region.
    """
    common_entities = create_common_entities()
    project = common_entities['project']
    self.ref_genome_1 = common_entities['reference_genome']
    chromosome = Chromosome.objects.get(reference_genome=self.ref_genome_1)

    alignment_group = AlignmentGroup.objects.create(
            label='Alignment 1',
            reference_genome=self.ref_genome_1,
            aligner=AlignmentGroup.ALIGNER.BWA)

    (self.sample_1, created) = ExperimentSample.objects.get_or_create(
            project=project,
            label=SAMPLE_1_LABEL)

    sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=alignment_group,
            experiment_sample=self.sample_1)

    def _create_variant(position):
        # Helper: one TRANSITION Variant plus its caller common data.
        variant = Variant.objects.create(
                type=Variant.TYPE.TRANSITION,
                reference_genome=self.ref_genome_1,
                chromosome=chromosome,
                position=position,
                ref_value='A')
        VariantCallerCommonData.objects.create(
                variant=variant,
                source_dataset_id=1,
                alignment_group=alignment_group,
                data={})

    # Create variants in the bed regions from best_test.bed.
    # BUGFIX: upper bounds are 199/399/599 because random.randint() is
    # inclusive on both ends while the assertions below use half-open
    # intervals — randint(101, 200) could yield 200, which lies outside
    # pyinter.closedopen(101, 200) and made the test flaky.
    for var_poor_map in range(20):
        _create_variant(random.randint(101, 199))

    for var_no_cov in range(20):
        _create_variant(random.randint(301, 399))
        _create_variant(random.randint(501, 599))

    new_bed_path = copy_dataset_to_entity_data_dir(
            entity=sample_alignment,
            original_source_location=TEST_BED)
    bed_dataset = add_dataset_to_entity(sample_alignment,
            dataset_label=Dataset.TYPE.BED_CALLABLE_LOCI,
            dataset_type=Dataset.TYPE.BED_CALLABLE_LOCI,
            filesystem_location=new_bed_path)

    vs_to_v_map = add_variants_to_set_from_bed(
            sample_alignment, bed_dataset)

    variant_set_labels = set([vs.label for vs in vs_to_v_map.keys()])
    self.assertEqual(set(['POOR_MAPPING_QUALITY', 'NO_COVERAGE']),
            variant_set_labels)

    for variant_set, variants in vs_to_v_map.items():
        for v in variants:
            # POOR MAPPING QUAL should be from 101 to 200
            if variant_set.label == 'POOR_MAPPING_QUALITY':
                self.assertTrue(v.position in pyinter.closedopen(
                        101, 200))
            # NO COVERAGE should be from 301 to 400, 501 to 600
            elif variant_set.label == 'NO_COVERAGE':
                self.assertTrue(v.position in pyinter.IntervalSet([
                        pyinter.closedopen(301, 400),
                        pyinter.closedopen(501, 600)]))
            else:
                raise AssertionError(
                        'bad variant set %s made.' % variant_set.label)
def test_add_variants_to_set_from_bed(self):
    """Verify variants are binned into sets by the callable-loci BED.

    Creates variants at random positions inside the BED regions
    (POOR_MAPPING_QUALITY over [101, 200); NO_COVERAGE over [301, 400)
    and [501, 600)), attaches the test BED as a dataset, then checks that
    add_variants_to_set_from_bed() groups each variant into the variant
    set named after its region.
    """
    common_entities = create_common_entities()
    project = common_entities['project']
    self.ref_genome_1 = common_entities['reference_genome']

    alignment_group = AlignmentGroup.objects.create(
            label='Alignment 1',
            reference_genome=self.ref_genome_1,
            aligner=AlignmentGroup.ALIGNER.BWA)

    (self.sample_1, created) = ExperimentSample.objects.get_or_create(
            project=project,
            label=SAMPLE_1_LABEL)

    sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=alignment_group,
            experiment_sample=self.sample_1)

    # Create variants in the bed regions from best_test.bed
    # NOTE(review): random.randint() is inclusive on both ends while the
    # assertions below use half-open intervals — a draw of exactly 200,
    # 400 or 600 may fall outside the asserted ranges; confirm against
    # the BED region bounds.
    for var_poor_map in range(20):
        variant = Variant.objects.create(
                type=Variant.TYPE.TRANSITION,
                reference_genome=self.ref_genome_1,
                chromosome=Chromosome.objects.get(
                        reference_genome=self.ref_genome_1),
                position=random.randint(101, 200),
                ref_value='A')

        vccd = VariantCallerCommonData.objects.create(
                variant=variant,
                source_dataset_id=1,
                alignment_group=alignment_group,
                data={})

    for var_no_cov in range(20):
        variant = Variant.objects.create(
                type=Variant.TYPE.TRANSITION,
                reference_genome=self.ref_genome_1,
                chromosome=Chromosome.objects.get(
                        reference_genome=self.ref_genome_1),
                position=random.randint(301, 400),
                ref_value='A')

        vccd = VariantCallerCommonData.objects.create(
                variant=variant,
                source_dataset_id=1,
                alignment_group=alignment_group,
                data={})

        variant = Variant.objects.create(
                type=Variant.TYPE.TRANSITION,
                reference_genome=self.ref_genome_1,
                chromosome=Chromosome.objects.get(
                        reference_genome=self.ref_genome_1),
                position=random.randint(501, 600),
                ref_value='A')

        vccd = VariantCallerCommonData.objects.create(
                variant=variant,
                source_dataset_id=1,
                alignment_group=alignment_group,
                data={})

    new_bed_path = copy_dataset_to_entity_data_dir(
            entity=sample_alignment,
            original_source_location=TEST_BED)
    bed_dataset = add_dataset_to_entity(
            sample_alignment,
            dataset_label=Dataset.TYPE.BED_CALLABLE_LOCI,
            dataset_type=Dataset.TYPE.BED_CALLABLE_LOCI,
            filesystem_location=new_bed_path)

    vs_to_v_map = add_variants_to_set_from_bed(sample_alignment, bed_dataset)

    variant_set_labels = set([vs.label for vs in vs_to_v_map.keys()])
    self.assertEqual(set(['POOR_MAPPING_QUALITY', 'NO_COVERAGE']),
            variant_set_labels)

    for variant_set, variants in vs_to_v_map.items():
        for v in variants:
            # POOR MAPPING QUAL should be from 101 to 200
            if variant_set.label == 'POOR_MAPPING_QUALITY':
                self.assertTrue(v.position in pyinter.closedopen(101, 200))
            # NO COVERAGE should be from 301 to 400, 501 to 600
            elif variant_set.label == 'NO_COVERAGE':
                self.assertTrue(v.position in pyinter.IntervalSet([
                        pyinter.closedopen(301, 400),
                        pyinter.closedopen(501, 600)
                ]))
            else:
                raise AssertionError('bad variant set %s made.' %
                        variant_set.label)
def combine_list_allformats(reference_genome_list, new_ref_genome_label,
        project):
    """Combine ReferenceGenomes into a new single ReferenceGenome composed
    of the component parts.

    Each input genome contributes its chromosomes (read from its Genbank
    dataset if present, otherwise its FASTA). If any genome contributes
    a Genbank, the combined output is written as Genbank; otherwise FASTA.

    Args:
        reference_genome_list: List of ReferenceGenome objects.
        new_ref_genome_label: Label for the new ReferenceGenome.
        project: Project to which the new ReferenceGenome will be added.

    Returns:
        Object with keys:
            * is_success
            * new_reference_genome (when is_success = True)
            * error_msg (when is_success = False)
    """
    # Pair each genome with its best dataset (Genbank preferred over FASTA).
    rg_dataset_list = []
    for ref_genome in reference_genome_list:
        rg_dataset_tup = None
        for dataset_type in [Dataset.TYPE.REFERENCE_GENOME_GENBANK,
                Dataset.TYPE.REFERENCE_GENOME_FASTA]:
            filter_result = ref_genome.dataset_set.filter(type=dataset_type)
            if len(filter_result):
                rg_dataset_tup = (ref_genome, filter_result[0])
                break
        if (not rg_dataset_tup or
                not os.path.exists(rg_dataset_tup[1].get_absolute_location())):
            return {
                'is_success': False,
                'error_msg': 'All reference genomes must have an associated \
FASTA or Genbank dataset'
            }
        else:
            rg_dataset_list.append(rg_dataset_tup)
    assert len(rg_dataset_list) == len(reference_genome_list)

    # Read the datasets into Biopython SeqRecord objects.
    rg_seqrecord_list = []
    seqrecord_ids = []
    for rg, dataset in rg_dataset_list:
        with open(dataset.get_absolute_location()) as input_fh:
            for record in SeqIO.parse(input_fh,
                    DATASET_TO_SEQIO_FORMAT[dataset.type]):
                rg_seqrecord_list.append((rg,record))
                seqrecord_ids.append('_'.join([rg.label[:7], record.id[:8]]))

    # If ReferenceGenome label and Chromosome id are the same, there will be
    # duplicate seqrecord_ids: resolve by including numeric prefix in id
    seq_record_list = []
    # Genbank LOCUS names are limited in length, so ids are truncated to fit.
    MAX_LOCUS_NAME_LEN = 16
    unique_id_len = len(str(len(seqrecord_ids)))
    # NOTE: Python 2 integer division — label_len is floored.
    label_len = (MAX_LOCUS_NAME_LEN - 2 - unique_id_len) / 2
    for i,seqrecord_id in enumerate(seqrecord_ids):
        rg, seqrecord = rg_seqrecord_list[i]
        if seqrecord_ids.count(seqrecord_id) == 1:
            unique_seqrecord_id = seqrecord_id
        else:
            # Duplicate id: prefix with the record's index to disambiguate.
            unique_seqrecord_id = '_'.join(
                    [str(i), rg.label[:label_len], seqrecord.id[:label_len]])
        seqrecord.name = seqrecord.id = unique_seqrecord_id
        seqrecord.seq.alphabet = ambiguous_dna
        seq_record_list.append(seqrecord)

    # Create a new ReferenceGenome.
    new_ref_genome = ReferenceGenome.objects.create(
            project=project,
            label=new_ref_genome_label,
            num_chromosomes=len(seq_record_list),
            num_bases=sum([len(seq) for seq in seq_record_list]))

    # Generate a filename from the label with non-alphanumeric characters
    # replaced by underscores.
    filename_prefix = generate_safe_filename_prefix_from_label(
            new_ref_genome_label)
    does_list_include_genbank = Dataset.TYPE.REFERENCE_GENOME_GENBANK in \
            [rg_dataset_tup[1].type for rg_dataset_tup in rg_dataset_list]
    if does_list_include_genbank:
        filename = filename_prefix + '.gb'
    else:
        filename = filename_prefix + '.fa'
    new_file_dest = os.path.join(new_ref_genome.get_model_data_dir(), filename)

    # Write the result.
    ref_genome_dataset_type = Dataset.TYPE.REFERENCE_GENOME_GENBANK if \
            does_list_include_genbank else Dataset.TYPE.REFERENCE_GENOME_FASTA
    output_file_format = DATASET_TO_SEQIO_FORMAT[ref_genome_dataset_type]
    with open(new_file_dest, 'w') as output_fh:
        SeqIO.write(seq_record_list, output_fh, output_file_format)

    # Create a dataset which will point to the file. This step must happen after
    # writing the file because a signal will be triggered which requires the
    # Genbank to exist already.
    add_dataset_to_entity(new_ref_genome, ref_genome_dataset_type,
            ref_genome_dataset_type, new_file_dest)

    return {
        'is_success': True,
        'new_reference_genome': new_ref_genome
    }
def test_end_to_end(self): """Test running full pipline on small-ish data. The data file consists of 20,000 bases. At 5,000 bases there is a 400 base deletion. At 10,000 bases there is a 400 base inversion. At 15,000 bases there is a 400 base tandem duplication. It seems that Pindel cannot find the inversion. Fortunately, delly can usually find inversions. Unfortunately, delly only works well on large data, so we will not test it here. """ # Create a new alignment group. alignment_group = AlignmentGroup.objects.create( label='test alignment', reference_genome=self.reference_genome) # Create a sample. sample_1 = ExperimentSample.objects.create( uid=TEST_SAMPLE_UID, project=self.project, label='sample1') ### Add the raw reads copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1, TEST_FASTQ1) copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2, TEST_FASTQ2) # Create relationship between alignment and sample. sample_alignment = ExperimentSampleToAlignment.objects.create( alignment_group=alignment_group, experiment_sample=sample_1) ### Add alignment data. NOTE: Stored in sample model dir. # index (no dataset) copy_dataset_to_entity_data_dir(sample_1, TEST_BAM_INDEX) # bam file (with dataset) copy_dest = copy_dataset_to_entity_data_dir(sample_1, TEST_BAM) add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN, Dataset.TYPE.BWA_ALIGN, copy_dest) # Make sure there are no variants before. self.assertEqual(0, len(Variant.objects.filter( reference_genome=self.reference_genome))) # Test with Pindel only for now. for tool in ['pindel']: find_variants_with_tool(alignment_group, VARIANT_TOOL_PARAMS_MAP[tool], project=self.project) # Check that the alignment group has a freebayes vcf dataset associated # with it. vcf_dataset = get_dataset_with_type(alignment_group, Dataset.TYPE.VCF_PINDEL) self.assertIsNotNone(vcf_dataset) # Make sure the .vcf file actually exists. 
self.assertTrue(os.path.exists(vcf_dataset.get_absolute_location())) # Make sure the vcf is valid by reading it using pyvcf. with open(vcf_dataset.get_absolute_location()) as vcf_fh: try: reader = vcf.Reader(vcf_fh) reader.next() except: self.fail("Not valid vcf") # Grab the resulting variants. variants = Variant.objects.filter(reference_genome=self.reference_genome) # Confirm that 2 variants found. self.assertEqual(2, len(variants)) variant_map = {} for variant in variants: variant_alternates = VariantAlternate.objects.filter(variant=variant) # There should be only one variant alternate per SV. self.assertEqual(len(variant_alternates), 1) pos = variant.position svtype = variant_alternates[0].data['INFO_SVTYPE'] svlen = variant_alternates[0].data['INFO_SVLEN'] variant_map[svtype] = (pos, svlen) # Check that there is a deletion around base 5000. self.assertTrue('DEL' in variant_map) self.assertTrue(abs(variant_map['DEL'][0] - 5000) <= 3) self.assertTrue(abs(variant_map['DEL'][1] - 400) <= 3) # Check that there is a tandem duplication around base 15000. self.assertTrue('DUP:TANDEM' in variant_map) self.assertTrue(abs(variant_map['DUP:TANDEM'][0] - 15000) <= 3) self.assertTrue(abs(variant_map['DUP:TANDEM'][1] - 400) <= 3)
def graph_contig_placement(contig_uid_list, skip_extracted_read_alignment, use_alignment_reads=True): """Align contigs passed in contig_list to the reference and to any annotated mobile elements in the reference genbank and use the alignment to build a sequence graph. The sequence graph is then used by graph walking algorithms that call structural variants from paths in the graph. Args: contig_uid_list: list of Contig objects skip_extracted_read_alignment: if False, extract the reads that assembled each contig and make them a bam track on the reference use_alignment_reads: if True, filter contig placements that would delete regions of moderate coverage Returns: placeable_contig_uid_list: Contig objects with metadata fields holding their reference placement parameters var_dict_list: list of dictionary representations of translocation variants with keys: chromosome, pos, ref_seq, alt_seq me_var_dict_list: list of dictionary representations of mobile element translocation variants with keys: chromosome, pos, ref_seq, alt_seq, MEINFO """ def _length_weighted_coverage(contig): return contig.num_bases * contig.coverage # Request contig_list from db and order by highest length weighted coverage contig_list = list(Contig.objects.filter(uid__in=contig_uid_list)) contig_list.sort(key=_length_weighted_coverage, reverse=True) sample_alignment = contig_list[0].experiment_sample_to_alignment sample_alignment.data['assembly_status'] = ( ExperimentSampleToAlignment.ASSEMBLY_STATUS.BUILDING_SEQUENCE_GRAPH ) sample_alignment.save() ref_genome = sample_alignment.alignment_group.reference_genome # Make Assembly dir assembly_dir = os.path.join(sample_alignment.get_model_data_dir(), 'assembly') contig_alignment_dir = os.path.join( assembly_dir, 'contig_alignment') if os.path.exists(contig_alignment_dir): shutil.rmtree(contig_alignment_dir) os.mkdir(contig_alignment_dir) # NOTE(gleb): Not sure whether these have to be ordered, but keeping # them ordered while refactoring. 
contigs_as_ordered_dict = OrderedDict( [(c.uid, c) for c in contig_list]) # Concatenate contig fastas for alignment contig_concat = os.path.join(contig_alignment_dir, 'contig_concat.fa') with open(contig_concat, 'w') as output_fh: for contig_uid, c in contigs_as_ordered_dict.iteritems(): contig_fasta_file = get_fasta(c) with open(contig_fasta_file) as read_fh: output_fh.write(read_fh.read()) # Create dictionaries to translate contig uid to its fasta descriptor line contig_qname_to_uid = {} for contig_uid, c in contigs_as_ordered_dict.items(): contig_fasta_file = get_fasta(c) with open(contig_fasta_file, 'r') as fh: descriptor = fh.next() contig_qname_to_uid[descriptor.strip('>\n')] = contig_uid # Get extracted mobile elements in addition to contigs if ref_genome.is_annotated(): me_fa_dataset = get_dataset_with_type( ref_genome, Dataset.TYPE.MOBILE_ELEMENT_FASTA) me_concat_fasta = me_fa_dataset.get_absolute_location() contig_alignment_to_me_bam = os.path.join( contig_alignment_dir, 'contig_alignment_to_me.bam') if not os.path.exists(contig_alignment_to_me_bam): ensure_bwa_index(me_concat_fasta) simple_align_with_bwa_mem( contig_concat, me_concat_fasta, contig_alignment_to_me_bam, ['-T', '15']) # Align concatenated contig fastas to reference contig_alignment_bam = os.path.join( contig_alignment_dir, 'contig_alignment.bam') print 'Aligning contigs to reference' simple_align_with_bwa_mem( contig_concat, get_fasta(ref_genome), contig_alignment_bam, ['-T', '15']) # Create graph G = nx.DiGraph() # Create sequence interval instances for reference and each contig ref_intervals = SequenceIntervals( ref_genome.uid, ref_genome.num_bases, tag='ref') G.ref_intervals = ref_intervals add_alignment_to_graph(G, contig_alignment_bam) if ref_genome.is_annotated(): add_me_alignment_to_graph(G, contig_alignment_to_me_bam) # Add SEQUENCE_GRAPH_PICKLE dataset to sample alignment graph_pickle_path = os.path.join( contig_alignment_dir, 'sequence_graph.pickle') nx.write_gpickle(G, 
graph_pickle_path) add_dataset_to_entity( sample_alignment, Dataset.TYPE.SEQUENCE_GRAPH_PICKLE, Dataset.TYPE.SEQUENCE_GRAPH_PICKLE, graph_pickle_path) detect_strand_chromosome_junctions(contig_qname_to_uid, contig_alignment_bam) placeable_contig_uid_list = [] iv_list = novel_seq_ins_walk(G) if use_alignment_reads: coverage_stats = get_coverage_stats(sample_alignment) sample_alignment_bam = sample_alignment.dataset_set.get( type=Dataset.TYPE.BWA_ALIGN).get_absolute_location() for insertion_vertices in iv_list: contig_qname = insertion_vertices.enter_contig.seq_uid contig_uid = contig_qname_to_uid[contig_qname] contig = Contig.objects.get(uid=contig_uid) set_contig_placement_params(contig, insertion_vertices) if use_alignment_reads: # Filter out deletions of good coverage regions deletion_length = (insertion_vertices.enter_ref.pos - insertion_vertices.exit_ref.pos) if deletion_length > 0: deletion_cov = avg_coverage( sample_alignment_bam, contig.metadata['chromosome'], insertion_vertices.exit_ref.pos, insertion_vertices.enter_ref.pos) chrom_cov_stats = coverage_stats[contig.metadata['chromosome']] chrom_cov_mean = chrom_cov_stats['mean'] chrom_cov_std = chrom_cov_stats['std'] if deletion_length <= 0 or ( deletion_cov < chrom_cov_mean - chrom_cov_std): placeable_contig_uid_list.append(contig.uid) else: placeable_contig_uid_list.append(contig.uid) # Perform translocation walk if ref_genome.num_chromosomes == 1: trans_iv_pairs = translocation_walk(G) var_dict_list = [parse_path_into_ref_alt(iv_pair, contig_qname_to_uid, sample_alignment) for iv_pair in trans_iv_pairs] var_dict_list = [var_d for var_d in var_dict_list if any([var_d['ref_seq'], var_d['alt_seq']])] if ref_genome.is_annotated(): me_trans_iv_pairs = me_translocation_walk(G) me_var_dict_list = [parse_path_into_ref_alt( iv_pair, contig_qname_to_uid, sample_alignment) for iv_pair in me_trans_iv_pairs] else: me_var_dict_list = [] else: print 'Translocation walk not implemented for multi-chromosomal refs' 
var_dict_list = [] me_var_dict_list = [] return placeable_contig_uid_list, var_dict_list, me_var_dict_list
def assemble_with_velvet(assembly_dir, velvet_opts, sv_indicants_bam,
        sample_alignment, overwrite=True,
        reassemble_contig_from_reads=False):
    """Run velvet on the SV-indicating reads and create a Contig model,
    backed by a fasta Dataset, for every node of the resulting assembly.

    Args:
        assembly_dir: Directory in which velvet writes its output.
        velvet_opts: Dict of velveth/velvetg options passed to _run_velvet.
        sv_indicants_bam: Bam of reads indicating structural variants;
            velvet's assembly input.
        sample_alignment: ExperimentSampleToAlignment the contigs belong to.
        overwrite: Only used by the disabled re-assembly path below.
        reassemble_contig_from_reads: Must be False; see NOTE below.

    Returns:
        List of uids of the created Contig models.
    """
    # NOTE: Unused. If enabled, will call make_contig_reads_to_ref_alignments()
    # which is not used anywhere currently due to performance issues which
    # are particularly bad when many unused reads.
    assert not reassemble_contig_from_reads

    # Same timestamp is recorded on every contig from this assembly run.
    timestamp = str(datetime.datetime.now())
    # Velvet names fasta records like 'NODE_<n>_length_..._cov_<coverage>'.
    contig_number_pattern = re.compile('^NODE_(\d+)_')

    reference_genome = sample_alignment.alignment_group.reference_genome

    contig_files = []
    contig_uid_list = []

    _run_velvet(assembly_dir, velvet_opts, sv_indicants_bam)

    # Collect resulting contigs fasta
    contigs_fasta = os.path.join(assembly_dir, 'contigs.fa')
    contig_files.append(contigs_fasta)

    records = list(SeqIO.parse(contigs_fasta, 'fasta'))
    # Width used to zero-pad contig numbers in labels so they sort nicely.
    digits = len(str(len(records))) + 1
    for (i, seq_record) in enumerate(records, 1):
        # Extract contig sequence from the contigs.fa file, number, and
        # name it.
        contig_node_number = int(
                contig_number_pattern.findall(seq_record.description)[0])
        # Trailing '_<coverage>' component of the velvet description.
        coverage = float(seq_record.description.rsplit('_', 1)[1])
        # Drop 'N' gap characters by concatenating the split segments.
        seq_record.seq = reduce(lambda x, y: x + y,
                [seq for seq in seq_record.seq.split('N')])
        seq_record.id = seq_record.name = seq_record.description = (
                'NODE_' + str(i))
        leading_zeros = digits - len(str(i))
        contig_label = '%s_%s' % (sample_alignment.experiment_sample.label,
                leading_zeros * '0' + str(i))

        # Create model and metadata.
        contig = Contig.objects.create(
                label=contig_label,
                parent_reference_genome=reference_genome,
                experiment_sample_to_alignment=(sample_alignment))
        contig.metadata['coverage'] = coverage
        contig.metadata['timestamp'] = timestamp
        contig.metadata['node_number'] = contig_node_number
        contig.metadata['assembly_dir'] = assembly_dir

        contig.ensure_model_data_dir_exists()

        # NOTE: Unused code.
        # Reassemble the contig from its constituent reads separately,
        # using a second velvet call.
        # if reassemble_contig_from_reads:
        #     # 1. Grab reads from velvet to reassemble the contig
        #     make_contig_reads_to_ref_alignments(contig,
        #             add_jbrowse_track=False, overwrite=overwrite)
        #     contig_reads_bam = os.path.join(
        #             contig.get_model_data_dir(),
        #             'sv_indicants.bam')
        #     # 2. Reassemble the contig from its whole reads using velvet -
        #     # this generates longer contigs because the graph will trim the
        #     # edges if there is a branchpoint. With only one node it should
        #     # be very fast.
        #     _run_velvet(contig.get_model_data_dir(), velvet_opts,
        #             contig_reads_bam)
        #     reassembled_seqrecord = _extract_node_from_contig_reassembly(
        #             contig)
        #     if reassembled_seqrecord:
        #         seq_record.seq = reassembled_seqrecord.seq

        # Write the contig fasta and add it as a dataset to the contig object.
        dataset_path = os.path.join(contig.get_model_data_dir(), 'fasta.fa')
        with open(dataset_path, 'w') as fh:
            SeqIO.write([seq_record], fh, 'fasta')
        add_dataset_to_entity(contig, 'contig_fasta',
                Dataset.TYPE.REFERENCE_GENOME_FASTA,
                filesystem_location=dataset_path)

        contig.save()

        # NOTE: Disabled for now. Severe performance issues.
        # Make a bam track on the reference for each contig that shows only
        # the reads that assembled the contig and their mates
        # make_contig_reads_to_ref_alignments(contig.uid)

        # append the uid to the contig_uid_list
        contig_uid_list.append(contig.uid)

    # once contigs are extracted, remove velvet data
    _cleanup_velvet_dir(assembly_dir)

    return contig_uid_list
def generate_contigs(sample_alignment, sv_read_classes={}, input_velvet_opts={}, overwrite=True): """Generates contigs. """ # Don't proceed if processing this sample alignment previously failed or # in another async process. sample_alignment = ExperimentSampleToAlignment.objects.get( uid=sample_alignment.uid) if (sample_alignment.data.get('assembly_status') == ExperimentSampleToAlignment.ASSEMBLY_STATUS.FAILED): return # Set assembly status for UI # NOTE: Setting this status is playing whack-a-mole against other async sv # detection functions, e.g. detect_deletion.cov_detect_deletion_make_vcf(). set_assembly_status( sample_alignment, ExperimentSampleToAlignment.ASSEMBLY_STATUS.ASSEMBLING) print 'Generating contigs\n' # Grab reference genome fasta path and ensure exists. reference_genome = sample_alignment.alignment_group.reference_genome reference_genome.dataset_set.get_or_create( type=Dataset.TYPE.REFERENCE_GENOME_FASTA)[0] # Make assembly_dir directory to house genome_finishing files assembly_dir = os.path.join( sample_alignment.get_model_data_dir(), 'assembly') # Make assembly directory if it does not exist, and remove it if it does if os.path.exists(assembly_dir): shutil.rmtree(assembly_dir) os.mkdir(assembly_dir) # Get a bam of sorted SV indicants with pairs sv_indicants_bam = get_sv_indicating_reads(sample_alignment, sv_read_classes, overwrite=overwrite) prev_dataset = get_dataset_with_type( sample_alignment, Dataset.TYPE.BWA_FOR_DE_NOVO_ASSEMBLY) if overwrite and prev_dataset: prev_dataset.delete() if overwrite or prev_dataset is None: sv_indicants_sorted_bam = (os.path.splitext(sv_indicants_bam)[0] + '.coordinate_sorted.bam') # Bam needs to be coordinated sorted to index sort_bam_by_coordinate(sv_indicants_bam, sv_indicants_sorted_bam) # Bam needs to be indexed for jbrowse index_bam(sv_indicants_sorted_bam) for_assembly_dataset = add_dataset_to_entity( sample_alignment, Dataset.TYPE.BWA_FOR_DE_NOVO_ASSEMBLY, Dataset.TYPE.BWA_FOR_DE_NOVO_ASSEMBLY, 
filesystem_location=sv_indicants_sorted_bam) for_assembly_dataset.save() # TODO(dbgoodman): Look into re-enabling this. Right now, this creates # thousands of tracks and appears to significantly slow down JBrowse. # add_bam_file_track(reference_genome, # sample_alignment, Dataset.TYPE.BWA_FOR_DE_NOVO_ASSEMBLY) velvet_opts = dict(DEFAULT_VELVET_OPTS) # Find insertion metrics ins_length, ins_length_sd = get_insert_size_mean_and_stdev( sample_alignment) velvet_opts['velvetg']['ins_length'] = ins_length velvet_opts['velvetg']['ins_length_sd'] = ins_length_sd # Find expected coverage avg_read_coverage = get_avg_genome_coverage( sample_alignment) # Calculate expected coverage in kmers genome_kmer_coverage = kmer_coverage(avg_read_coverage, ins_length, velvet_opts['velveth']['hash_length']) exp_cov = genome_kmer_coverage * VELVET_CONTIG_COVERAGE_EXPECTED velvet_opts['velvetg']['exp_cov'] = exp_cov # # Set cov cutoff cov_cutoff = genome_kmer_coverage * VELVET_CONTIG_COVERAGE_CUTOFF velvet_opts['velvetg']['cov_cutoff'] = cov_cutoff # Update velvet_opts with input_velvet_opts for shallow_key in ['velveth', 'velvetg']: if shallow_key in input_velvet_opts: for deep_key in input_velvet_opts[shallow_key]: velvet_opts[shallow_key][deep_key] = ( input_velvet_opts[shallow_key][deep_key]) # Perform velvet assembly and generate contig objects. contig_uid_list = assemble_with_velvet( assembly_dir, velvet_opts, sv_indicants_bam, sample_alignment, overwrite=overwrite) # Evaluate contigs for mapping. evaluate_contigs(contig_uid_list) # Update status again if not FAILED. sample_alignment = ExperimentSampleToAlignment.objects.get( uid=sample_alignment.uid) if (sample_alignment.data.get('assembly_status') != ExperimentSampleToAlignment.ASSEMBLY_STATUS.FAILED): set_assembly_status( sample_alignment, ExperimentSampleToAlignment.ASSEMBLY_STATUS.WAITING_TO_PARSE)
def test_run_alignment_with_spaces_in_genbank_filename(self):
    """End-to-end alignment where the uploaded Genbank filename
    contains spaces.
    """
    project = self.common_entities['project']
    ref_genome_label = 'dirty_upload'

    # Build a fake browser-upload POST request.
    upload_request = HttpRequest()
    upload_request.method = 'POST'
    upload_request.POST = {
        'projectUid': project.uid,
        'refGenomeLabel': ref_genome_label,
        'importFileFormat': 'genbank'
    }
    upload_request.user = self.common_entities['user']
    authenticate(username=TEST_USERNAME, password=TEST_PASSWORD)
    self.assertTrue(upload_request.user.is_authenticated())

    # The uploaded file's name deliberately contains spaces.
    upload_request.FILES['refGenomeFile'] = UploadedFile(
            file=open(TEST_GENBANK),
            name='dirty_genbank (spaces).gb')

    response = create_ref_genome_from_browser_upload(upload_request)
    self.assertEqual(STATUS_CODE__SUCCESS, response.status_code)
    self.assertFalse(json.loads(response.content).get('error', False))

    # Get reference genome
    ref_genome = ReferenceGenome.objects.get(
            project=project, label=ref_genome_label)

    # Create sample model
    sample = ExperimentSample.objects.create(
            project=project, label='test_sample')

    # Attach paired-end fastq data to the sample.
    for fastq_type, fastq_path in (
            (Dataset.TYPE.FASTQ1, TEST_DIRTY_FQ_1),
            (Dataset.TYPE.FASTQ2, TEST_DIRTY_FQ_2)):
        add_dataset_to_entity(
                sample,
                fastq_type,
                fastq_type,
                filesystem_location=fastq_path)

    # Run alignment of sample to reference and block until both async
    # stages complete.
    alignment_group, alignment_async_result, \
            variant_calling_async_result = run_pipeline(
                    'test_alignment', ref_genome, [sample])
    alignment_async_result.get()
    variant_calling_async_result.get()

    alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
    self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
            alignment_group.status)
def compute_insert_metrics(bam_file, sample_alignment, stderr=None):
    """Computes read fragment insert size distribution.

    Pipes `samtools view` output into lumpy's pairend_distro script, which
    writes a histogram file and prints mean/stdev on stdout.

    Creates a Dataset for each of:
        * histogram file
        * file with mean and stdev comma-separated

    Args:
        bam_file: Path to the aligned bam whose insert sizes are measured.
        sample_alignment: ExperimentSampleToAlignment the metric Datasets
            are attached to.
        stderr: Optional file handle passed to both subprocesses.

    Raises:
        ValueError if calculating paired-end distribution failed.
    """
    histo_file = os.path.splitext(bam_file)[0] + '.insert_size_histogram.txt'
    mean_stdev_file = (os.path.splitext(bam_file)[0] +
            '.insert_size_mean_stdev.txt')

    # First, we analyze the bam distribution.
    read_bam_cmd = [settings.SAMTOOLS_BINARY, 'view', bam_file]
    p1 = Popen(read_bam_cmd, stdout=PIPE, stderr=stderr)

    read_length = get_read_length(bam_file)

    pairend_distro_cmd = [
        settings.LUMPY_PAIREND_DISTRO_BIN,
        '-r', str(read_length),
        '-X', '4',  # num stdevs from end to extend
        '-N', '10000',  # number to sample
        '-o', histo_file
    ]
    p2 = Popen(pairend_distro_cmd, stdin=p1.stdout, stdout=PIPE,
            stderr=stderr)

    # Allow p1 to receive a SIGPIPE if p2 exits.
    p1.stdout.close()

    # Run the command and get mean, stdev.
    # Expected stdout shape: 'mean:<float>\tstdev:<float>'.
    mean_and_stdev_str = p2.communicate()[0]
    mean_and_stdev_parts = mean_and_stdev_str.split('\t')
    if len(mean_and_stdev_parts) != 2:
        raise ValueError(
                "Poor alignment. Perhaps you tried aligning to the wrong "
                "reference genome?")
    raw_mean, raw_stdev = mean_and_stdev_parts
    # Each part looks like 'mean:123.4'; keep the integer part only.
    mean = int(float(raw_mean.split(':')[1].strip()))
    stdev = int(float(raw_stdev.split(':')[1].strip()))

    # Lumpy doesn't like stdev of 0.
    if stdev < 1:
        stdev = 1

    # Save the histogram file as a Dataset.
    add_dataset_to_entity(sample_alignment,
            Dataset.TYPE.LUMPY_INSERT_METRICS_HISTOGRAM,
            Dataset.TYPE.LUMPY_INSERT_METRICS_HISTOGRAM,
            filesystem_location=histo_file)

    # Write mean, stdev to another file and create another Dataset.
    with open(mean_stdev_file, 'w') as fh:
        fh.write("%d,%d" % (mean, stdev))
    add_dataset_to_entity(sample_alignment,
            Dataset.TYPE.LUMPY_INSERT_METRICS_MEAN_STDEV,
            Dataset.TYPE.LUMPY_INSERT_METRICS_MEAN_STDEV,
            filesystem_location=mean_stdev_file)
def combine_list_allformats(reference_genome_list, new_ref_genome_label,
        project):
    """Combine ReferenceGenomes into a new single ReferenceGenome composed
    of the component parts.

    Args:
        reference_genome_list: List of ReferenceGenome objects.
        new_ref_genome_label: Label for the new ReferenceGenome.
        project: Project to which the new ReferenceGenome will be added.

    Returns:
        Object with keys:
            * is_success
            * new_reference_genome (when is_success = True)
            * error_msg (when is_success = False)
    """
    # Pair each ReferenceGenome with its Genbank dataset, falling back to
    # its FASTA dataset.
    rg_dataset_list = []
    for ref_genome in reference_genome_list:
        rg_dataset_tup = None
        for dataset_type in [
                Dataset.TYPE.REFERENCE_GENOME_GENBANK,
                Dataset.TYPE.REFERENCE_GENOME_FASTA]:
            filter_result = ref_genome.dataset_set.filter(type=dataset_type)
            if len(filter_result):
                rg_dataset_tup = (ref_genome, filter_result[0])
                break

        if (not rg_dataset_tup or
                not os.path.exists(
                        rg_dataset_tup[1].get_absolute_location())):
            # BUGFIX: this message used to be a backslash-continued string
            # literal, so the source's leading indentation leaked into the
            # user-facing message as a run of spaces; implicit literal
            # concatenation keeps it clean.
            return {
                'is_success': False,
                'error_msg': ('All reference genomes must have an '
                        'associated FASTA or Genbank dataset')
            }
        else:
            rg_dataset_list.append(rg_dataset_tup)

    assert len(rg_dataset_list) == len(reference_genome_list)

    # Read the datasets into Biopython SeqRecord objects.
    rg_seqrecord_list = []
    seqrecord_ids = []
    seqrecord_descriptions = []
    for rg, dataset in rg_dataset_list:
        with open(dataset.get_absolute_location()) as input_fh:
            for record in SeqIO.parse(input_fh,
                    DATASET_TO_SEQIO_FORMAT[dataset.type]):
                rg_seqrecord_list.append((rg, record))
                # Truncate label/id parts so the combined id fits locus
                # name length limits.
                seqrecord_ids.append('_'.join([
                        remove_whitespace(rg.label)[:7],
                        remove_whitespace(record.id)[:8]]))
                seqrecord_descriptions.append(record.description)

    # Create a new ReferenceGenome.
    new_ref_genome = ReferenceGenome.objects.create(
            project=project,
            label=new_ref_genome_label)

    # If ReferenceGenome label and Chromosome id are the same, there will be
    # duplicate seqrecord_ids: resolve by including numeric prefix in id
    seq_record_list = []
    MAX_LOCUS_NAME_LEN = 16
    unique_id_len = len(str(len(seqrecord_ids)))
    # Integer division; `//` makes the Python 2 `/`-on-ints semantics
    # explicit (and safe if the file is ever ported to Python 3).
    label_len = (MAX_LOCUS_NAME_LEN - 2 - unique_id_len) // 2
    for i, seqrecord_id in enumerate(seqrecord_ids):
        rg, seqrecord = rg_seqrecord_list[i]
        if seqrecord_ids.count(seqrecord_id) == 1:
            unique_seqrecord_id = seqrecord_id
        else:
            unique_seqrecord_id = '_'.join([
                    str(i),
                    remove_whitespace(rg.label)[:label_len],
                    remove_whitespace(seqrecord.id)[:label_len]])

        seqrecord.seq.alphabet = ambiguous_dna
        seqrecord.name = unique_seqrecord_id
        seqrecord.id = unique_seqrecord_id
        # Disambiguate identical descriptions by appending the source
        # genome's label.
        if seqrecord_descriptions.count(seqrecord.description) > 1:
            seqrecord.description = ' '.join(
                    [seqrecord.description, 'from Reference Genome:',
                     rg.label])
        seq_record_list.append(seqrecord)

        Chromosome.objects.create(
                reference_genome=new_ref_genome,
                label=seqrecord.id,
                seqrecord_id=seqrecord.id,
                num_bases=len(seqrecord))

    # Generate a filename from the label with non-alphanumeric characters
    # replaced by underscores.
    filename_prefix = generate_safe_filename_prefix_from_label(
            new_ref_genome_label)

    does_list_include_genbank = (Dataset.TYPE.REFERENCE_GENOME_GENBANK in
            [rg_dataset_tup[1].type for rg_dataset_tup in rg_dataset_list])
    if does_list_include_genbank:
        filename = filename_prefix + '.gb'
    else:
        filename = filename_prefix + '.fa'
    new_file_dest = os.path.join(new_ref_genome.get_model_data_dir(),
            filename)

    # Write the result.
    if does_list_include_genbank:
        ref_genome_dataset_type = Dataset.TYPE.REFERENCE_GENOME_GENBANK
    else:
        ref_genome_dataset_type = Dataset.TYPE.REFERENCE_GENOME_FASTA
    output_file_format = DATASET_TO_SEQIO_FORMAT[ref_genome_dataset_type]
    with open(new_file_dest, 'w') as output_fh:
        SeqIO.write(seq_record_list, output_fh, output_file_format)

    # Create a dataset which will point to the file. This step must happen
    # after writing the file because a signal will be triggered which requires
    # the Genbank to exist already.
    add_dataset_to_entity(new_ref_genome, ref_genome_dataset_type,
            ref_genome_dataset_type, new_file_dest)

    return {'is_success': True, 'new_reference_genome': new_ref_genome}
def bootstrap_fake_data():
    """Fill the database with fake data.

    Creates projects, reference genomes, samples (with and without backing
    fastq data), an alignment, fake variants, regions, and a full-VCF test
    fixture, then bootstraps the SV test project.
    """
    user = get_or_create_user()

    ### Create some projects
    (test_project, project_created) = Project.objects.get_or_create(
            title=TEST_PROJECT_NAME, owner=user.get_profile())
    (test_project_2, project_created) = Project.objects.get_or_create(
            title=SV_PROJECT_NAME, owner=user.get_profile())

    ### Create some reference genomes
    ref_genome_1 = import_reference_genome_from_local_file(
            test_project, REF_GENOME_1_LABEL, TEST_FASTA, 'fasta')
    ref_genome_2 = import_reference_genome_from_local_file(
            test_project, REF_GENOME_2_LABEL, TEST_FASTA, 'fasta')
    ref_genome_3 = import_reference_genome_from_local_file(
            test_project, 'test_genome', TEST_FASTA, 'fasta')

    ### Create some saved queries.
    for saved_query_text in CUSTOM_SAVED_QUERY_LIST:
        SavedVariantFilterQuery.objects.get_or_create(
                owner=user.get_profile(), text=saved_query_text)

    ### Create some ExperimentSamples.

    # Create some samples without backing data just to explore the UI.
    ExperimentSample.objects.create(project=test_project,
            label='C321D_MiSeq', data={'SAMPLE_WELL': 'A01'})
    ExperimentSample.objects.create(project=test_project,
            label='C321D Fixed 01', data={'SAMPLE_WELL': 'A02'})
    ExperimentSample.objects.create(project=test_project,
            label='C321D Fixed 02', data={'SAMPLE_WELL': 'A03'})

    # Create some samples with backing data.
    (sample_1, created) = ExperimentSample.objects.get_or_create(
            project=test_project, label=SAMPLE_1_LABEL)
    # Add datasets to the samples.
    if not sample_1.dataset_set.filter(type=Dataset.TYPE.FASTQ1):
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1, TEST_FASTQ1)
    if not sample_1.dataset_set.filter(type=Dataset.TYPE.FASTQ2):
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2, TEST_FASTQ2)

    # Create sample backed by g-zipped data.
    gz_backed_sample = ExperimentSample.objects.create(
            project=test_project, label='sample backed by gz data')
    gz_fastq1_dataset = copy_and_add_dataset_source(gz_backed_sample,
            Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1, TEST_FASTQ_GZ_1)
    # BUGFIX: the label arg was Dataset.TYPE.FASTQ1 while the type arg was
    # FASTQ2; both should be FASTQ2 for the reverse reads, mirroring the
    # paired calls above.
    gz_fastq2_dataset = copy_and_add_dataset_source(gz_backed_sample,
            Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2, TEST_FASTQ_GZ_2)
    run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq1_dataset)
    run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq2_dataset, rev=True)

    ### Create an alignment.
    alignment_group_1 = AlignmentGroup.objects.create(
            label='Alignment 1',
            reference_genome=ref_genome_3,
            aligner=AlignmentGroup.ALIGNER.BWA)

    # Link it to a sample.
    sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=alignment_group_1,
            experiment_sample=sample_1)

    ### Add alignment data. NOTE: Stored in sample model dir.
    # NOTE: This is a bit convoluted. Perhaps it would be better to store
    # alignments in the ExperimentSampleToAlignment directory.
    copy_dest = copy_dataset_to_entity_data_dir(sample_1, TEST_BAM)
    copy_dataset_to_entity_data_dir(sample_1, TEST_BAM_INDEX)
    add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
            Dataset.TYPE.BWA_ALIGN, copy_dest)

    # Create fake variants.
    create_fake_variants_and_variant_sets(ref_genome_1)

    #############################
    # Full VCF Testing (annotated for snpeff, variant filtering, etc)
    #############################

    # Create a new reference genome and samples using full_vcf_test_set
    full_vcf_reference_genome = import_reference_genome_from_local_file(
            test_project, 'mg1655_tolC_through_zupT',
            FullVCFTestSet.TEST_GENBANK, 'genbank')

    # Create all samples.
    parent_obj = None
    full_vcf_samples = []
    for i in range(FullVCFTestSet.NUM_SAMPLES):
        sample_obj = ExperimentSample.objects.create(
                project=test_project,
                label='Sample %d' % i)

        sample_obj.data['SAMPLE_WELL'] = 'A0%d' % (i + 1)

        # Sample 0 is the parent of all subsequent samples.
        if i == 0:
            parent_obj = sample_obj
        if i > 0:
            sample_obj.data['SAMPLE_PARENTS'] = parent_obj.label
            parent_obj.add_child(sample_obj)
            parent_obj.save()

        sample_obj.save()

        # Add raw reads to each sample.
        fastq1_dataset = copy_and_add_dataset_source(sample_obj,
                Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
                FullVCFTestSet.FASTQ1[i])
        fastq2_dataset = copy_and_add_dataset_source(sample_obj,
                Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2,
                FullVCFTestSet.FASTQ2[i])

        # Run FASTQC on sample reads.
        run_fastqc_on_sample_fastq(sample_obj, fastq1_dataset)
        run_fastqc_on_sample_fastq(sample_obj, fastq2_dataset, rev=True)

        full_vcf_samples.append(sample_obj)

    # Run the alignment. Return the alignment group created, indexed by the
    # reference genome's uid.
    run_pipeline('test_align', full_vcf_reference_genome, full_vcf_samples)

    import_variant_set_from_vcf(full_vcf_reference_genome, 'Designed',
            FullVCFTestSet.TEST_DESIGNED_SNPS)

    def _create_region_intervals(region, interval_tuple_list):
        """Helper method to create RegionIntervals for a Region.

        Args:
            region: Region Model object.
            interval_tuple_list: List of tuples of intervals to create.
        """
        for interval in interval_tuple_list:
            RegionInterval.objects.create(
                    region=region,
                    start=interval[0],
                    end=interval[1])

    # Create some fake regions.
    # TODO: Should not be much harder to replace this with real regions.
    region_1 = Region.objects.create(
            reference_genome=full_vcf_reference_genome,
            label='region_1',
            type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_1, [(1, 150), (300, 400), (500, 900)])

    region_2 = Region.objects.create(
            reference_genome=full_vcf_reference_genome,
            label='region_2',
            type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_2, [(1000, 1500)])

    region_3 = Region.objects.create(
            reference_genome=full_vcf_reference_genome,
            label='region_3',
            type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_3, [(1800, 1900), (2150, 2300)])

    # And some GENE regions.
    gene_A = Region.objects.create(
            reference_genome=full_vcf_reference_genome,
            label='geneA',
            type=Region.TYPE.GENE)
    _create_region_intervals(gene_A, [(2000, 2400)])

    gene_B = Region.objects.create(
            reference_genome=full_vcf_reference_genome,
            label='geneB',
            type=Region.TYPE.GENE)
    _create_region_intervals(gene_B, [(4800, 5200)])

    gene_C = Region.objects.create(
            reference_genome=full_vcf_reference_genome,
            label='geneC',
            type=Region.TYPE.GENE)
    _create_region_intervals(gene_C, [(1, 500)])

    # Bootstrap test_project_2 with SV stuff
    sv_testing_bootstrap(test_project_2)
def generate_contigs(sample_alignment, sv_read_classes={}, input_velvet_opts={}, overwrite=True): """Generates contigs. """ # Don't proceed if processing this sample alignment previously failed or # in another async process. sample_alignment = ExperimentSampleToAlignment.objects.get( uid=sample_alignment.uid) if (sample_alignment.data.get('assembly_status') == ExperimentSampleToAlignment.ASSEMBLY_STATUS.FAILED): return # Set assembly status for UI # NOTE: Setting this status is playing whack-a-mole against other async sv # detection functions, e.g. detect_deletion.cov_detect_deletion_make_vcf(). set_assembly_status(sample_alignment, ExperimentSampleToAlignment.ASSEMBLY_STATUS.ASSEMBLING) print 'Generating contigs\n' # Grab reference genome fasta path and ensure exists. reference_genome = sample_alignment.alignment_group.reference_genome reference_genome.dataset_set.get_or_create( type=Dataset.TYPE.REFERENCE_GENOME_FASTA)[0] # Make assembly_dir directory to house genome_finishing files assembly_dir = os.path.join(sample_alignment.get_model_data_dir(), 'assembly') # Make assembly directory if it does not exist, and remove it if it does if os.path.exists(assembly_dir): shutil.rmtree(assembly_dir) os.mkdir(assembly_dir) # Get a bam of sorted SV indicants with pairs sv_indicants_bam = get_sv_indicating_reads(sample_alignment, sv_read_classes, overwrite=overwrite) prev_dataset = get_dataset_with_type(sample_alignment, Dataset.TYPE.BWA_FOR_DE_NOVO_ASSEMBLY) if overwrite and prev_dataset: prev_dataset.delete() if overwrite or prev_dataset is None: sv_indicants_sorted_bam = (os.path.splitext(sv_indicants_bam)[0] + '.coordinate_sorted.bam') # Bam needs to be coordinated sorted to index sort_bam_by_coordinate(sv_indicants_bam, sv_indicants_sorted_bam) # Bam needs to be indexed for jbrowse index_bam(sv_indicants_sorted_bam) for_assembly_dataset = add_dataset_to_entity( sample_alignment, Dataset.TYPE.BWA_FOR_DE_NOVO_ASSEMBLY, Dataset.TYPE.BWA_FOR_DE_NOVO_ASSEMBLY, 
filesystem_location=sv_indicants_sorted_bam) for_assembly_dataset.save() # TODO(dbgoodman): Look into re-enabling this. Right now, this creates # thousands of tracks and appears to significantly slow down JBrowse. # add_bam_file_track(reference_genome, # sample_alignment, Dataset.TYPE.BWA_FOR_DE_NOVO_ASSEMBLY) velvet_opts = dict(DEFAULT_VELVET_OPTS) # Find insertion metrics ins_length, ins_length_sd = get_insert_size_mean_and_stdev( sample_alignment) velvet_opts['velvetg']['ins_length'] = ins_length velvet_opts['velvetg']['ins_length_sd'] = ins_length_sd # Find expected coverage avg_read_coverage = get_avg_genome_coverage(sample_alignment) # Calculate expected coverage in kmers genome_kmer_coverage = kmer_coverage(avg_read_coverage, ins_length, velvet_opts['velveth']['hash_length']) exp_cov = genome_kmer_coverage * VELVET_CONTIG_COVERAGE_EXPECTED velvet_opts['velvetg']['exp_cov'] = exp_cov # # Set cov cutoff cov_cutoff = genome_kmer_coverage * VELVET_CONTIG_COVERAGE_CUTOFF velvet_opts['velvetg']['cov_cutoff'] = cov_cutoff # Update velvet_opts with input_velvet_opts for shallow_key in ['velveth', 'velvetg']: if shallow_key in input_velvet_opts: for deep_key in input_velvet_opts[shallow_key]: velvet_opts[shallow_key][deep_key] = ( input_velvet_opts[shallow_key][deep_key]) # Perform velvet assembly and generate contig objects. contig_uid_list = assemble_with_velvet(assembly_dir, velvet_opts, sv_indicants_bam, sample_alignment, overwrite=overwrite) # Evaluate contigs for mapping. evaluate_contigs(contig_uid_list) # Update status again if not FAILED. sample_alignment = ExperimentSampleToAlignment.objects.get( uid=sample_alignment.uid) if (sample_alignment.data.get('assembly_status') != ExperimentSampleToAlignment.ASSEMBLY_STATUS.FAILED): set_assembly_status( sample_alignment, ExperimentSampleToAlignment.ASSEMBLY_STATUS.WAITING_TO_PARSE)