def order_references( subread_file, reference_file ):
    """
    Rank reference sequences by how often alignment hits name them.

    The alignment is read from 'temp.m1'; it is only generated (via
    align_best_reference) when valid_file rejects the existing file.

    Returns a list of target names (hit.tname), most frequently hit first.
    """
    log.info("Selecting the best references sequences to use")
    alignment_path = 'temp.m1'
    if not valid_file( alignment_path ):
        align_best_reference( subread_file, reference_file, alignment_path )
    # Tally one count per hit, keyed on the target the hit aligned to
    hit_counts = Counter()
    for hit in BlasrReader( alignment_path ):
        hit_counts[hit.tname] += 1
    return [name for name, _count in hit_counts.most_common()]
def sort_subreads( subread_file, reference_file ):
    """
    Map every aligned query name to the target name it hit.

    The alignment is read from 'temp2.m1'; it is only generated (via
    align_best_reference) when valid_file rejects the existing file.

    Returns a dict of {hit.qname: hit.tname}; if a query name appears in
    several hits, the last one read wins.
    """
    log.info("Aligning subreads to the two best references")
    alignment_path = 'temp2.m1'
    if not valid_file( alignment_path ):
        align_best_reference( subread_file, reference_file, alignment_path )
    assignments = {}
    for hit in BlasrReader( alignment_path ):
        assignments[hit.qname] = hit.tname
    return assignments
def sort_subreads(subread_file, reference_file):
    """
    Build a query-name -> target-name lookup from the subread alignment.

    'temp2.m1' is reused when valid_file accepts it; otherwise it is
    produced first by align_best_reference.

    Returns a dict keyed on hit.qname with hit.tname as the value.
    """
    log.info("Aligning subreads to the two best references")
    m1_path = 'temp2.m1'
    needs_alignment = not valid_file(m1_path)
    if needs_alignment:
        align_best_reference(subread_file, reference_file, m1_path)
    return dict((hit.qname, hit.tname) for hit in BlasrReader(m1_path))
def order_references(subread_file, reference_file):
    """
    Order reference names by the number of alignment hits against them.

    'temp.m1' is reused when valid_file accepts it; otherwise it is
    produced first by align_best_reference.

    Returns every target name seen in the alignment, most-hit first.
    """
    log.info("Selecting the best references sequences to use")
    m1_path = 'temp.m1'
    if not valid_file(m1_path):
        align_best_reference(subread_file, reference_file, m1_path)
    target_names = [hit.tname for hit in BlasrReader(m1_path)]
    ranked = Counter(target_names).most_common()
    return [pair[0] for pair in ranked]
def align_contigs_to_genome(self, contig_file):
    """
    Return the parsed Contig->Genome alignment, creating the file if needed.

    The alignment lives at get_filepath('alignments', 'contigs_to_genome.m1').
    When valid_file rejects that path, the contigs are aligned against
    self.human_reference and the output is checked with check_output_file.
    """
    log.info("Looking for Contig-to-Genome alignment data")
    m1_path = self.get_filepath( 'alignments', 'contigs_to_genome.m1' )
    # NOTE(review): the source layout was flattened; the output check and the
    # "Finished" message are assumed to belong to the creation branch -- confirm.
    if not valid_file( m1_path ):
        log.info("No Contig->Genome alignment found, creating...")
        align_best_reference( contig_file,
                              self.human_reference,
                              output=m1_path )
        check_output_file( m1_path )
        log.info("Finished aligning contigs to the genomic reference\n")
    else:
        log.info("Using existing Contig->Genome alignment file\n")
    return create_m1_reference( m1_path )
def _align_sequences(query, reference):
    """
    Align one fasta file of sequences to another.

    The alignment is written to a throw-away '.m1' file that is always
    removed before returning, including when alignment or parsing raises.

    Returns the list of hits read by BlasrReader, or None when valid_file
    rejects the alignment output.
    """
    # delete=False: the aligner is handed the path by name, so the file must
    # outlive this handle. Close the handle right away so the descriptor is
    # not leaked while the aligner writes to the same path.
    temp = NamedTemporaryFile(suffix='.m1', delete=False)
    temp.close()
    try:
        align_best_reference(query, reference, output=temp.name)
        if not valid_file(temp.name):
            return None
        # Materialize the hits before the finally clause removes the file
        return list(BlasrReader(temp.name))
    finally:
        os.unlink(temp.name)
def align_contigs_to_reference(self, contig_file, reference_file):
    """
    Return the parsed Contig->Reference alignment, creating the file if needed.

    The alignment lives at get_filepath('alignments', 'contigs_to_reference.m1').
    When valid_file rejects that path, contig_file is aligned against
    reference_file and the output is checked with check_output_file.
    """
    log.info("Looking for Contig-to-Reference alignment data")
    m1_path = self.get_filepath( 'alignments', 'contigs_to_reference.m1' )
    # NOTE(review): the source layout was flattened; the output check and the
    # "Finished" message are assumed to belong to the creation branch -- confirm.
    if not valid_file( m1_path ):
        log.info("No Contig->Reference alignment found, creating...")
        align_best_reference( contig_file,
                              reference_file,
                              output=m1_path )
        check_output_file( m1_path )
        log.info("Finished aligning contigs to the HLA reference data\n")
    else:
        log.info("Using an existing Contig->Reference alignment file\n")
    return create_m1_reference( m1_path )
def align_subreads_to_contigs(self, subread_file, contig_file ):
    """
    Return the parsed Subread->Contig alignment, creating the file if needed.

    The alignment lives at get_filepath('alignments', 'subreads_to_contigs.m1').
    When valid_file rejects that path, subread_file is aligned against
    contig_file and the output is checked with check_output_file.
    """
    log.info("Looking for Subread-to-Contig alignment data")
    m1_path = self.get_filepath( 'alignments', 'subreads_to_contigs.m1' )
    # NOTE(review): the source layout was flattened; the output check and the
    # "Finished" message are assumed to belong to the creation branch -- confirm.
    if not valid_file( m1_path ):
        log.info("No Subread->Contig alignment found, creating...")
        align_best_reference( subread_file,
                              contig_file,
                              output=m1_path )
        check_output_file( m1_path )
        log.info("Finished aligning subreads to the HBAR contigs\n")
    else:
        log.info("Using existing Subread->Contig alignment file\n")
    return create_m1_reference( m1_path )
def align_contigs_to_genome(self, contig_file):
    """
    Load the Contig->Genome alignment, generating it first when it is missing.

    Uses the file at get_filepath('alignments', 'contigs_to_genome.m1'); if
    valid_file rejects it, contig_file is aligned to self.human_reference and
    the result is verified with check_output_file.
    """
    log.info("Looking for Contig-to-Genome alignment data")
    alignment_path = self.get_filepath('alignments', 'contigs_to_genome.m1')
    already_aligned = valid_file(alignment_path)
    # NOTE(review): the source layout was flattened; the output check and the
    # "Finished" message are assumed to belong to the creation branch -- confirm.
    if already_aligned:
        log.info("Using existing Contig->Genome alignment file\n")
    else:
        log.info("No Contig->Genome alignment found, creating...")
        align_best_reference(
            contig_file, self.human_reference, output=alignment_path)
        check_output_file(alignment_path)
        log.info("Finished aligning contigs to the genomic reference\n")
    return create_m1_reference(alignment_path)
def create_chimeras(input_file, output=None, reference_file=None, alignment_file=None):
    """
    Build chimeric sequences from a Fasta file and write them to a new Fasta.

    input_file     -- Fasta file of sequences to build chimeras from
    output         -- destination Fasta; defaults to '<input stem>.chimeras.fasta'
    reference_file -- reference to align input_file against when no
                      alignment_file is supplied
    alignment_file -- existing alignment of input_file, used as-is when given

    Returns the path of the Fasta file that was written.
    Raises IOError when neither an alignment nor a reference is supplied.
    """
    import os

    # An existing alignment always wins; otherwise one is generated from the
    # reference. A missing or empty reference is rejected here instead of
    # surfacing later as a failure to read a None alignment.
    if alignment_file is None:
        if not reference_file:
            msg = "create_chimeras requires either an Alignment or a Reference!"
            log.error(msg)
            raise IOError(msg)
        alignment_file = align_best_reference(input_file, reference_file)

    # Set the output file if not specified; splitext only strips an extension
    # from the final path component, so extension-less inputs keep their name
    if output is None:
        basename = os.path.splitext(input_file)[0]
        output = '%s.chimeras.fasta' % basename

    # Parse the alignment data, group and filter it, then build the chimeras
    alignments = list(BlasrReader(alignment_file))
    groups = _filter_groups(_group_by_locus(alignments))
    sequences = list(FastaReader(input_file))
    chimeras = list(_create_chimeras(groups, sequences))
    write_fasta(chimeras, output)
    return output
def align_contigs_to_reference(self, contig_file, reference_file):
    """
    Load the Contig->Reference alignment, generating it first when missing.

    Uses the file at get_filepath('alignments', 'contigs_to_reference.m1'); if
    valid_file rejects it, contig_file is aligned to reference_file and the
    result is verified with check_output_file.
    """
    log.info("Looking for Contig-to-Reference alignment data")
    alignment_path = self.get_filepath('alignments', 'contigs_to_reference.m1')
    already_aligned = valid_file(alignment_path)
    # NOTE(review): the source layout was flattened; the output check and the
    # "Finished" message are assumed to belong to the creation branch -- confirm.
    if already_aligned:
        log.info("Using an existing Contig->Reference alignment file\n")
    else:
        log.info("No Contig->Reference alignment found, creating...")
        align_best_reference(
            contig_file, reference_file, output=alignment_path)
        check_output_file(alignment_path)
        log.info("Finished aligning contigs to the HLA reference data\n")
    return create_m1_reference(alignment_path)
def align_subreads_to_contigs(self, subread_file, contig_file):
    """
    Load the Subread->Contig alignment, generating it first when missing.

    Uses the file at get_filepath('alignments', 'subreads_to_contigs.m1'); if
    valid_file rejects it, subread_file is aligned to contig_file and the
    result is verified with check_output_file.
    """
    log.info("Looking for Subread-to-Contig alignment data")
    alignment_path = self.get_filepath('alignments', 'subreads_to_contigs.m1')
    already_aligned = valid_file(alignment_path)
    # NOTE(review): the source layout was flattened; the output check and the
    # "Finished" message are assumed to belong to the creation branch -- confirm.
    if already_aligned:
        log.info("Using existing Subread->Contig alignment file\n")
    else:
        log.info("No Subread->Contig alignment found, creating...")
        align_best_reference(
            subread_file, contig_file, output=alignment_path)
        check_output_file(alignment_path)
        log.info("Finished aligning subreads to the HBAR contigs\n")
    return create_m1_reference(alignment_path)
def type_fasta( input_fofn, input_fasta, exon_fofn, genomic_reference, cDNA_reference ):
    """
    Run the typing pipeline on a Fasta file of sequences.

    Three stages, each delegating to helpers defined elsewhere:
      1. align, re-orient and select sequences, then summarize typing from
         the genomic and cDNA alignments;
      2. create chimera sequences and combine them with the input Fasta;
      3. align 'reads_of_insert.fasta' (extracted next to input_fasta) to
         the combined Fasta and summarize the alleles.

    Returns None; results are produced by the helpers.
    """
    # Stage 1: align the sequences to the reference and annotate typing
    initial_hits = align_best_reference( input_fasta, genomic_reference )
    oriented = orient_fasta( input_fasta, alignment_file=initial_hits )
    alleles = extract_alleles( oriented, alignment_file=initial_hits )
    genomic_hits = full_align_best_reference( alleles, genomic_reference )
    exon_seqs = extract_cDNA( alleles, exon_fofn, alignment_file=genomic_hits )
    cdna_hits = align_by_identity( exon_seqs, cDNA_reference )
    summarize_typing( genomic_hits, cdna_hits )

    # Stage 2: generate mock chimeras; the combined file name drops the last
    # two dot-separated suffixes of the chimera file name
    chimera_fasta = create_chimeras( alleles, alignment_file=genomic_hits )
    stem_parts = chimera_fasta.split('.')[:-2]
    merged_fasta = '%s.combined.fasta' % '.'.join( stem_parts )
    combine_fasta( [input_fasta, chimera_fasta], merged_fasta )

    # Stage 3: competitive alignment of the best reads against the combined
    # sequences to summarize the allelic breakdown
    insert_reads = os.path.join( os.path.dirname( input_fasta ),
                                 'reads_of_insert.fasta' )
    extract_best_reads( input_fofn, insert_reads )
    competitive_hits = align_best_reference( insert_reads, merged_fasta )
    summarize_alleles( competitive_hits, initial_hits, alleles )
def type_sequences( input_folder, exon_fofn, genomic_reference, cDNA_reference ):
    """
    Run the typing pipeline on the Amplicon Analysis output in a folder.

    Reads 'amplicon_analysis.fastq' and 'amplicon_analysis.csv' from
    input_folder, aligns, re-orients and selects the sequences, then
    summarizes typing from the genomic and cDNA alignments.

    Returns None; results are produced by the helpers.
    """
    fastq_path = os.path.join( input_folder, 'amplicon_analysis.fastq' )
    csv_path = os.path.join( input_folder, 'amplicon_analysis.csv' )

    # Align the raw sequences, then re-orient both the sequences and the CSV
    initial_hits = align_best_reference( fastq_path, genomic_reference )
    oriented = orient_sequences( fastq_path, alignment_file=initial_hits )
    oriented_csv = orient_amp_analysis( csv_path, initial_hits )

    # Select the alleles; the CSV is subset to match (return value unused here)
    alleles = extract_alleles( oriented, alignment_file=initial_hits )
    subset_amp_analysis( oriented_csv, alleles )

    # Genomic and cDNA alignments feed the typing summary
    genomic_hits = full_align_best_reference( alleles, genomic_reference )
    exon_seqs = extract_cDNA( alleles, exon_fofn, alignment_file=genomic_hits )
    cdna_hits = align_by_identity( exon_seqs, cDNA_reference )
    summarize_typing( genomic_hits, cdna_hits )
def type_sequences(input_folder, exon_fofn, genomic_reference, cDNA_reference):
    """
    Type the Amplicon Analysis sequences found in input_folder.

    Expects 'amplicon_analysis.fastq' and 'amplicon_analysis.csv' inside
    input_folder. The sequences are aligned, re-oriented and filtered to the
    selected alleles, and the typing is summarized from the genomic and cDNA
    alignments of that selection.

    Returns None; results are produced by the helpers.
    """
    in_folder = lambda name: os.path.join(input_folder, name)
    fastq_path = in_folder('amplicon_analysis.fastq')
    csv_path = in_folder('amplicon_analysis.csv')

    # First pass: align to the genomic reference and fix orientation
    first_pass = align_best_reference(fastq_path, genomic_reference)
    oriented_seqs = orient_sequences(fastq_path, alignment_file=first_pass)
    oriented_csv = orient_amp_analysis(csv_path, first_pass)

    # Keep the selected alleles and subset the CSV to match
    chosen = extract_alleles(oriented_seqs, alignment_file=first_pass)
    subset_amp_analysis(oriented_csv, chosen)

    # Second pass: full genomic alignment, then cDNA extraction and alignment
    genomic_pass = full_align_best_reference(chosen, genomic_reference)
    cdna_path = extract_cDNA(chosen, exon_fofn, alignment_file=genomic_pass)
    cdna_pass = align_by_identity(cdna_path, cDNA_reference)
    summarize_typing(genomic_pass, cdna_pass)
def create_chimeras( input_file, output=None, reference_file=None, alignment_file=None ):
    """
    Build chimeric sequences from a Fasta file and write them to a new Fasta.

    input_file     -- Fasta file of sequences to build chimeras from
    output         -- destination Fasta; defaults to '<input stem>.chimeras.fasta'
    reference_file -- reference to align input_file against when no
                      alignment_file is supplied
    alignment_file -- existing alignment of input_file, used as-is when given

    Returns the path of the Fasta file that was written.
    Raises IOError when neither an alignment nor a reference is supplied.
    """
    import os

    # An existing alignment always wins; otherwise one is generated from the
    # reference. A missing or empty reference is rejected here instead of
    # surfacing later as a failure to read a None alignment.
    if alignment_file is None:
        if not reference_file:
            msg = "create_chimeras requires either an Alignment or a Reference!"
            log.error( msg )
            raise IOError( msg )
        alignment_file = align_best_reference( input_file, reference_file )

    # Set the output file if not specified; splitext only strips an extension
    # from the final path component, so extension-less inputs keep their name
    if output is None:
        basename = os.path.splitext( input_file )[0]
        output = '%s.chimeras.fasta' % basename

    # Parse the alignment data, group and filter it, then build the chimeras
    alignments = list( BlasrReader( alignment_file ))
    groups = _filter_groups( _group_by_locus( alignments ))
    sequences = list( FastaReader( input_file ))
    chimeras = list( _create_chimeras( groups, sequences ))
    write_fasta( chimeras, output )
    return output