def write_amp_analysis_records(records, filename):
    """Dump each AmpAnalysisRecord in `records` to `filename` as Fasta."""
    log.info("Writing {0} AmpAnalysisRecords to {1}".format(len(records), filename))
    with AmpliconAnalysisWriter(filename) as writer:
        for item in records:
            writer.write_fasta(item)
    # Confirm the output landed on disk
    check_output_file(filename)
def align_by_identity(query, reference_fasta, output=None, format='1'):
    """
    Type sequences in a fasta file by finding the closest reference.

    Each query record is aligned individually against `reference_fasta`;
    the best (maximum-identity) hit per record is written to `output`.
    """
    assert format in ['1', '5']
    # Default the output name to "<query basename>.m<format>"
    if output is None:
        basename = '.'.join(query.split('.')[:-1])
        output = '%s.m%s' % (basename, format)
    ref_count = fasta_size(reference_fasta)
    with BlasrWriter(output) as handle:
        # NOTE(review): header is always 'm1' even when format == '5' -- confirm intended
        handle.write_header('m1')
        for query_record in read_sequences(query):
            log.info('Aligning %s by identity to %s references' % (query_record.name, ref_count))
            temp_fasta = write_temp_fasta(query_record)
            hits = _align_fasta(temp_fasta.name, reference_fasta, format)
            if not hits:
                log.info("No hits found for %s" % query_record.name)
                continue
            hits = _filter_alignments(_sort_alignments(hits))
            log.info('Found %s alignments sharing maximum identity with the query' % len(hits))
            handle.write(hits[0])
            os.unlink(temp_fasta.name)
    check_output_file(output)
    return output
def _extract_exons( record, exon_fofn, output_type, directory ):
    """
    Extract Fasta Records of each Exon from a Fasta Record

    Aligns `record` against each per-exon Fasta listed in `exon_fofn`
    and writes the exons found to "all_exons.<output_type>" under
    `directory`. Returns that path, or None if no exons were found.
    """
    log.info('Extracting exons from "%s"' % record.name)
    temp_fasta = _write_temp_fasta( record )
    output_file = os.path.join( directory, 'all_exons.%s' % output_type )
    output_handle = _open_output_handle( output_file, output_type )
    # Iterate over the individual Exon Fasta files looking for alignments
    exon_count = 0
    for exon_fasta in read_list_file( exon_fofn ):
        # Exon number read from a fixed position in the filename, e.g.
        # "...exon2.fasta"[-7] == '2' -- assumes single-character exon
        # ids and a '.fasta' suffix; TODO confirm the naming convention
        exon_num = exon_fasta[-7]
        start, end = _find_exon_position( temp_fasta, exon_fasta, directory )
        if start is None or end is None:
            continue
        exon_count += 1
        exon_record = _extract_exon_record( record, exon_num, start, end )
        output_handle.writeRecord( exon_record )
    os.unlink( temp_fasta )
    output_handle.close()
    if exon_count:
        log.info("Extracted %s exons from %s" % (exon_count, record.name))
    else:
        # No exons found: warn and signal failure to the caller
        log.warn("No valid exons found for %s!" % record.name)
        return None
    check_output_file( output_file )
    return output_file
def _extract_exons(record, exon_fofn, output_type, directory):
    """Write a record for every exon of `record` found via the exon FOFN."""
    log.info('Extracting exons from "%s"' % record.name)
    query_fasta = _write_temp_fasta(record)
    out_path = os.path.join(directory, 'all_exons.%s' % output_type)
    writer = _open_output_handle(out_path, output_type)
    found = 0
    # Probe each per-exon reference file for an alignment to the query
    for exon_file in read_list_file(exon_fofn):
        number = exon_file[-7]  # exon id embedded at a fixed filename position
        begin, finish = _find_exon_position(query_fasta, exon_file, directory)
        if begin is None or finish is None:
            continue
        found += 1
        writer.writeRecord(_extract_exon_record(record, number, begin, finish))
    os.unlink(query_fasta)
    writer.close()
    if not found:
        log.warn("No valid exons found for %s!" % record.name)
        return None
    log.info("Extracted %s exons from %s" % (found, record.name))
    check_output_file(out_path)
    return out_path
def align_by_identity( query, reference_fasta, output=None, format='1' ): """ Type sequences in a fasta file by finding the closet reference """ # If output isn't specified, base it on the query assert format in ['1', '5'] if output is None: basename = '.'.join( query.split('.')[:-1] ) output = '%s.m%s' % (basename, format) ref_count = fasta_size(reference_fasta) # Iterate over each Fasta, aligning individually. with BlasrWriter( output ) as handle: handle.write_header( 'm1' ) for record in read_sequences( query ): log.info('Aligning %s by identity to %s references' % (record.name, ref_count)) temp = write_temp_fasta( record ) alignments = _align_fasta( temp.name, reference_fasta, format ) if not alignments: log.info("No hits found for %s" % record.name) continue alignments = _sort_alignments( alignments ) alignments = _filter_alignments( alignments ) log.info('Found %s alignments sharing maximum identity with the query' % len(alignments)) handle.write( alignments[0] ) os.unlink( temp.name ) check_output_file( output ) return output
def align_reference_to_contigs(locus, reference, contig_file):
    """Blasr-align the locus reference against the assembled contigs."""
    output_file = 'HLA-%s.m1' % locus
    options = {'nproc': 8,
               'out': output_file,
               'bestn': 1,
               'noSplitSubreads': True}
    run_blasr(reference, contig_file, options)
    check_output_file(output_file)
    return output_file
def combine_fasta( fasta_files, destination ):
    """
    Combine multiple Fasta files into a single file at `destination`.

    Unreadable inputs are skipped with a warning, so the combination is
    best-effort rather than all-or-nothing.
    """
    with FastaWriter( destination ) as handle:
        for fasta in fasta_files:
            try:
                for record in FastaReader( fasta ):
                    handle.writeRecord( record )
            # Narrowed from a bare "except:", which would also swallow
            # KeyboardInterrupt and SystemExit
            except Exception:
                log.warn('Could not open "%s" as Fasta' % fasta)
    check_output_file( destination )
def _write_output(records, output_file, output_type):
    """Dump `records` to `output_file` as Fasta or (by default) Fastq."""
    if output_type != 'fasta':
        with FastqWriter(output_file) as writer:
            for rec in records:
                writer.writeRecord(rec)
    else:
        write_fasta(records, output_file)
    check_output_file(output_file)
def combine_fasta(fasta_files, destination):
    """
    Combine multiple Fasta files into one file, skipping unreadable inputs
    with a warning (best-effort combination).
    """
    with FastaWriter(destination) as handle:
        for fasta in fasta_files:
            try:
                for record in FastaReader(fasta):
                    handle.writeRecord(record)
            # Narrowed from a bare "except:", which would also swallow
            # KeyboardInterrupt and SystemExit
            except Exception:
                log.warn('Could not open "%s" as Fasta' % fasta)
    check_output_file(destination)
def _write_output( records, output_file, output_type ):
    """Write the records out to `output_file` in the requested format."""
    if output_type == 'fasta':
        write_fasta( records, output_file )
    else:
        # Anything other than 'fasta' is treated as Fastq
        fastq_writer = FastqWriter( output_file )
        with fastq_writer as writer:
            for item in records:
                writer.writeRecord( item )
    check_output_file( output_file )
def write_fastq( records, output_file ):
    """
    Write a FastqRecord, or a list of FastqRecords, out to file

    Returns `output_file` after verifying it was written. Raises an
    AssertionError if any item is not a FastqRecord.
    """
    # The docstring promises single-record support, but iterating a lone
    # FastqRecord would fail -- normalize to a list first
    if isinstance( records, FastqRecord ):
        records = [ records ]
    with FastqWriter( output_file ) as handle:
        for record in records:
            assert isinstance( record, FastqRecord )
            handle.writeRecord( record )
    check_output_file( output_file )
    return output_file
def write_fastq(records, output_file):
    """
    Write a FastqRecord, or a list of FastqRecords, out to file

    Returns `output_file` after verifying it was written.
    """
    # Docstring promises single-record support, but iterating a bare
    # FastqRecord would fail -- wrap it in a list first
    if isinstance(records, FastqRecord):
        records = [records]
    with FastqWriter(output_file) as handle:
        for record in records:
            assert isinstance(record, FastqRecord)
            handle.writeRecord(record)
    check_output_file(output_file)
    return output_file
def multi_sequence_alignment(input_file, output=None):
    """
    Run Muscle on `input_file` and return the alignment filename.
    """
    # Resolve the output filename (removing any stale copy)
    output = _get_output_file(input_file, output, 'afa')
    # Invoke Muscle with the resolved paths
    run_muscle({'in': input_file, 'out': output})
    # Verify the alignment was produced
    check_output_file(output)
    return output
def combine_fastq( sequence_files, output_file ):
    """
    Combine a series of sequence files into one Fastq

    Unreadable inputs are skipped with a warning rather than aborting
    the whole combination.
    """
    with FastqWriter( output_file ) as handle:
        for filename in sequence_files:
            try:
                for record in FastqReader( filename ):
                    handle.writeRecord( record )
            except Exception:
                # BUG FIX: warning previously referenced the undefined
                # name "fasta" and raised NameError on any failure; also
                # narrowed the bare "except:"
                log.warn('Could not open "%s" as Fastq' % filename)
    check_output_file( output_file )
    return output_file
def extract_whitelist_reads( self, white_list_ids ):
    """
    Convert a White List of Ids into a White List of Sequences
    """
    # Output path: same basename as the id file, '.fasta' suffix
    root = '.'.join( white_list_ids.split('.')[:-1] )
    seq_file = '%s.fasta' % root
    extract_subreads( self._input_file,
                      seq_file,
                      self._min_read_length,
                      self._min_read_score,
                      white_list_ids )
    check_output_file( seq_file )
    return seq_file
def combine_fasta(sequence_files, output_file):
    """
    Combine a series of sequence files into one Fasta

    Unreadable inputs are skipped with a warning.
    """
    with FastaWriter(output_file) as handle:
        for filename in sequence_files:
            try:
                for record in FastaReader(filename):
                    handle.writeRecord(record)
            except Exception:
                # BUG FIX: warning previously referenced the undefined
                # name "fasta" (NameError on any failure); also narrowed
                # the bare "except:"
                log.warn('Could not open "%s" as Fasta' % filename)
    check_output_file(output_file)
    return output_file
def summarize_typing( gDNA_align, cDNA_align, output=None ):
    """
    Summarize gDNA and cDNA alignment hits into a ".typing" report file.
    """
    # Default the report name to the gDNA alignment's basename
    if output is None:
        output = '%s.typing' % '.'.join( gDNA_align.split('.')[:-1] )
    log.info("Writing summary of typing information to %s" % output)
    gDNA_hits = _parse_alignment( gDNA_align )
    cDNA_hits = _parse_alignment( cDNA_align )
    _write_type_summary( _summarize_hits( gDNA_hits, cDNA_hits ), output )
    check_output_file( output )
    return output
def _collect_cDNA( folder, output_file ):
    """
    Collect all of the cDNA sequences into one Fasta File
    """
    found = []
    # Search each immediate subdirectory of `folder` for cDNA files
    for entry in os.listdir( folder ):
        subdir = os.path.join( folder, entry )
        if not os.path.isdir( subdir ):
            continue
        for name in os.listdir( subdir ):
            if name.endswith( ('cDNA.fasta', 'cDNA.fastq') ):
                found.append( os.path.join( subdir, name ) )
    combine_sequences( found, output_file )
    check_output_file( output_file )
def read_log_data(folder):
    """Parse per-consensus chimera scores from an AmpliconAnalysis log."""
    log_path = os.path.join(folder, "amplicon_analysis.log")
    check_output_file(log_path)
    scores = {}
    with open(log_path) as handle:
        for raw in handle:
            parts = raw.strip().split()
            name = parts[2]
            # "abundant," lines mark non-chimeric consensus sequences
            if "abundant," in parts:
                scores[name] = 0.0
            elif "chimera" in parts:
                scores[name] = float(parts[8])
    return scores
def combine_sequences(sequence_files, output_file):
    """
    Combine a series of sequence files into one Fasta or Fastq

    Raises TypeError when the inputs mix formats.
    """
    if all(is_fasta(f) for f in sequence_files):
        combine_fasta(sequence_files, output_file)
    elif all(is_fastq(f) for f in sequence_files):
        combine_fastq(sequence_files, output_file)
    else:
        message = "Input files must be all Fasta or Fastq"
        log.error(message)
        raise TypeError(message)
    check_output_file(output_file)
    return output_file
def rename_fofn(input_fofn, output_fofn, name_key):
    """
    Rename a FOFN of subread files

    Each file listed in `input_fofn` is renamed via `rename_fasta` and the
    new name is written, one per line, to `output_fofn`.
    """
    with open(output_fofn, "w") as writer, open(input_fofn, "r") as handle:
        for raw_line in handle:
            source = raw_line.strip()
            if not source:
                continue  # skip blank lines
            renamed = source.split(".")[0] + "_renamed.fasta"
            rename_fasta(source, renamed, name_key)
            writer.write(renamed + "\n")
    check_output_file(output_fofn)
    return output_fofn
def rename_fofn( input_fofn, output_fofn, name_key ):
    """
    Rename a FOFN of subread files

    Each file listed in `input_fofn` is renamed via `rename_fasta` and
    the new filename written, one per line, to `output_fofn`.
    """
    # BUG FIX: the output FOFN was opened in read mode ('r'), so every
    # writer.write() call raised IOError -- open it for writing instead
    with open( output_fofn, 'w' ) as writer:
        with open( input_fofn, 'r' ) as handle:
            for line in handle:
                filename = line.strip()
                if filename:
                    renamed_file = filename.split('.')[0] + '_renamed.fasta'
                    rename_fasta( filename, renamed_file, name_key )
                    writer.write( renamed_file + '\n' )
    check_output_file( output_fofn )
    return output_fofn
def read_log_data(folder):
    """Extract chimera scores per consensus id from the analysis log."""
    log_path = os.path.join(folder, 'amplicon_analysis.log')
    check_output_file(log_path)
    chimera_scores = {}
    with open(log_path) as log_handle:
        for entry in log_handle:
            tokens = entry.strip().split()
            consensus_id = tokens[2]
            if 'abundant,' in tokens:
                # Abundant consensus sequences get a zero chimera score
                chimera_scores[consensus_id] = 0.0
            elif 'chimera' in tokens:
                chimera_scores[consensus_id] = float(tokens[8])
    return chimera_scores
def summarize_typing(gDNA_align, cDNA_align, output=None):
    """
    Merge gDNA and cDNA alignment hits into a typing summary file.
    """
    if output is None:
        basename = '.'.join(gDNA_align.split('.')[:-1])
        output = '%s.typing' % basename
    log.info("Writing summary of typing information to %s" % output)
    # Parse each alignment, then summarize the combined hits
    gDNA_hits = _parse_alignment(gDNA_align)
    cDNA_hits = _parse_alignment(cDNA_align)
    _write_type_summary(_summarize_hits(gDNA_hits, cDNA_hits), output)
    check_output_file(output)
    return output
def combine_sequences( sequence_files, output_file ):
    """
    Merge sequence files into one Fasta or Fastq output file.

    Raises TypeError when inputs are neither all-Fasta nor all-Fastq.
    """
    if all( is_fasta(f) for f in sequence_files ):
        combine_fasta( sequence_files, output_file )
    elif all( is_fastq(f) for f in sequence_files ):
        combine_fastq( sequence_files, output_file )
    else:
        msg = "Input files must be all Fasta or Fastq"
        log.error( msg )
        raise TypeError( msg )
    check_output_file( output_file )
    return output_file
def write_fasta( fasta_records, output_file ):
    """
    Write a FastaRecord, or list of records, out to file
    """
    with FastaWriter( output_file ) as handle:
        if isinstance( fasta_records, list ):
            for item in fasta_records:
                handle.writeRecord( item )
        elif isinstance( fasta_records, FastaRecord ):
            handle.writeRecord( fasta_records )
        else:
            # Neither a record nor a list of records
            msg = "Input Record(s) type not recognized"
            log.error( msg )
            raise TypeError( msg )
    check_output_file( output_file )
def write_fasta(fasta_records, output_file):
    """Write one FastaRecord, or a list of them, to `output_file`."""
    with FastaWriter(output_file) as handle:
        if isinstance(fasta_records, FastaRecord):
            handle.writeRecord(fasta_records)
        elif isinstance(fasta_records, list):
            for entry in fasta_records:
                handle.writeRecord(entry)
        else:
            error_msg = "Input Record(s) type not recognized"
            log.error(error_msg)
            raise TypeError(error_msg)
    check_output_file(output_file)
def align_subreads( self, white_list, reference_file ):
    """
    Align the subreads in a Whitelist to the created reference
    """
    basename = '.'.join( reference_file.split('.')[:-1] )
    alignment_file = '%s.m1' % basename
    num_refs = fasta_size( reference_file )
    # Keep every reference as a candidate so the best hit is exact
    run_blasr( white_list,
               reference_file,
               { 'nproc': self._nproc,
                 'out': alignment_file,
                 'bestn': 1,
                 'nCandidates': num_refs,
                 'noSplitSubreads': True } )
    check_output_file( alignment_file )
    return alignment_file
def separate_alleles( self, white_list ):
    """
    Recursively partition a whitelist of reads into allele clusters.

    Runs AmpliconAnalysis on `white_list` (clustering enabled only on
    the first iteration); when more than one consensus cluster results,
    the subreads are partitioned by best-aligned reference and this
    method recurses on each partition.
    """
    # Run the first pass, with clustering
    log.info("Beginning iteration #%s" % self._count)
    # Python 2 style debug prints
    print
    # NOTE(review): prints self._output_filelist here but appends to
    # self.output_filelist below -- confirm which attribute is intended
    print self._count, self._output_filelist
    print
    curr_output = os.path.join( self._output, 'Iteration_%s' % self._count )
    # Reuse any existing output from a previous run of this iteration
    output_file = amp_assem_output_exists( curr_output )
    if output_file:
        log.info('Existing output detected, skipping...')
    else:
        log.info('No existing output detected, proceeding ...')
        if self._count == 0:
            # For the first pass we enable clustering
            output_file = self.run_analysis( curr_output, white_list, cluster=True )
        else:
            # For all other iterations, we disable clustering
            output_file = self.run_analysis( curr_output, white_list, cluster=False )
    check_output_file( output_file )
    # Outputs of a single Fasta File are returned as is:
    log.info("Finished iteration #%s" % self._count)
    self._count += 1
    fasta_count = fasta_size( output_file )
    if fasta_count == 1:
        # Base case: a single cluster cannot be split further
        log.info('AmpliconAnalysis generated 1 cluster, exiting...')
        self.output_filelist.append( output_file )
        return
    log.info('Amplicon Analysis generated %s clusters, continuing splitting' % fasta_count)
    # Otherwise we partition the reads and run the process on each partition
    alignment = self.align_subreads( white_list, output_file )
    groups = group_subreads( alignment )
    output_dir = os.path.dirname( output_file )
    sub_lists = []
    for reference, group in groups.iteritems():
        group_file = '%s.ids' % reference
        group_path = os.path.join( output_dir, group_file )
        write_whitelist( group, group_path )
        white_list_seqs = self.extract_whitelist_reads( group_path )
        sub_lists.append( white_list_seqs )
        # NOTE(review): this size guard runs after the append, and the
        # "continue" is the last statement of the loop body, so small
        # groups are still recursed on below -- confirm whether the check
        # was meant to come before the append
        if len(group) < MIN_SIZE:
            log.info('')
            continue
    # Recurse on every partition
    for sub_list in sub_lists:
        self.separate_alleles( sub_list )
def write_sequences( records, output_file ):
    """
    Write a sequence record, or list of records, out to file
    """
    is_list = isinstance( records, list )
    if is_list and all( isinstance( r, FastaRecord ) for r in records ):
        write_fasta( records, output_file )
    elif is_list and all( isinstance( r, FastqRecord ) for r in records ):
        write_fastq( records, output_file )
    elif isinstance( records, FastaRecord ):
        # Single records are wrapped in a list for the writers
        write_fasta( [records], output_file )
    elif isinstance( records, FastqRecord ):
        write_fastq( [records], output_file )
    else:
        msg = "Input Record(s) type not recognized"
        log.error( msg )
        raise TypeError( msg )
    check_output_file( output_file )
    return output_file
def rename_fasta( input_file, output_file, name_key ):
    """
    Rename a single Fasta of subreads
    """
    name_map = read_dict_file( name_key )
    with FastaWriter( output_file ) as writer:
        for entry in FastaReader( input_file ):
            # Only the first whitespace-delimited token is the name
            key = entry.name.split()[0]
            if key not in name_map:
                msg = "Sequence name not found!"
                log.error( msg )
                raise KeyError( msg )
            writer.writeRecord( FastaRecord( name_map[key], entry.sequence ) )
    check_output_file( output_file )
    return output_file
def _align_subreads( subread_fasta, reference_fasta, locus ):
    """
    Align all locus-specific subreads against the appropriate references
    """
    alignment_file = os.path.join( os.path.dirname( subread_fasta ), 'temp.m1' )
    num_subreads = fasta_size( subread_fasta )
    num_refs = fasta_size( reference_fasta )
    log.info("Aligning %s reads against %s references for %s" %
             (num_subreads, num_refs, locus))
    # bestn=1 with all references as candidates gives the single best hit
    run_blasr( subread_fasta,
               reference_fasta,
               {'nproc': 8,
                'out': alignment_file,
                'bestn': 1,
                'nCandidates': num_refs,
                'noSplitSubreads': True} )
    check_output_file( alignment_file )
    return alignment_file
def write_sequences(records, output_file):
    """Write a FastaRecord/FastqRecord, or a homogeneous list, to file."""
    if isinstance(records, FastaRecord):
        write_fasta([records], output_file)
    elif isinstance(records, FastqRecord):
        write_fastq([records], output_file)
    elif isinstance(records, list) and all(
            isinstance(r, FastaRecord) for r in records):
        write_fasta(records, output_file)
    elif isinstance(records, list) and all(
            isinstance(r, FastqRecord) for r in records):
        write_fastq(records, output_file)
    else:
        msg = "Input Record(s) type not recognized"
        log.error(msg)
        raise TypeError(msg)
    check_output_file(output_file)
    return output_file
def _align_subreads(subread_fasta, reference_fasta, locus):
    """
    Align all locus-specific subreads against the appropriate references
    """
    location = os.path.dirname(subread_fasta)
    alignment_file = os.path.join(location, 'temp.m1')
    read_total = fasta_size(subread_fasta)
    ref_total = fasta_size(reference_fasta)
    # All references stay candidates; bestn=1 keeps the top hit per read
    options = {'nproc': 8,
               'out': alignment_file,
               'bestn': 1,
               'nCandidates': ref_total,
               'noSplitSubreads': True}
    log.info("Aligning %s reads against %s references for %s" %
             (read_total, ref_total, locus))
    run_blasr(subread_fasta, reference_fasta, options)
    check_output_file(alignment_file)
    return alignment_file
def split_results(amp_analysis):
    """Split the output of an Amplicon Analysis job by Barcode"""
    # `amp_analysis` is the job's output directory
    assert os.path.isdir(amp_analysis)
    sequence_path = os.path.join(amp_analysis, "amplicon_analysis.fasta")
    check_output_file(sequence_path)
    # Python 2 print statement
    print "Analyzing %s output sequences" % fasta_size(sequence_path)
    barcode_path = os.path.join(amp_analysis, "by_barcode")
    create_directory(barcode_path)
    records = list(FastaReader(sequence_path))
    # Initialize one bucket per barcode label ...
    barcodes = {get_barcode(r): [] for r in records}
    # ... then fill them via a side-effect list comprehension
    [barcodes[get_barcode(r)].append(r) for r in records]
    barcode_files = {}
    # Write one Fasta per barcode; map barcode -> file path
    for barcode, records in barcodes.iteritems():
        barcode_file = barcode + ".fasta"
        sample_path = os.path.join(barcode_path, barcode_file)
        with FastaWriter(sample_path) as handle:
            for record in records:
                handle.writeRecord(record)
        barcode_files[barcode] = sample_path
    return barcode_files
def full_align_best_reference(query, reference, output=None):
    """
    Blasr-align `query` to `reference` in m5 format, keeping the best hit.
    """
    # Resolve (and clear, if present) the output filename
    output = _get_output_file(query, output, 'm5')
    candidates = fasta_size(reference)
    log.info("Aligning %s sequences to %s references" % (query, candidates))
    # NOTE(review): bare `nproc` is presumably a module-level constant -- confirm
    options = {'nproc': nproc,
               'out': output,
               'm': 5,
               'bestn': 1,
               'nCandidates': candidates,
               'noSplitSubreads': True}
    # Use the suffix-array index when one exists alongside the reference
    if reference_has_index(reference):
        options['sa'] = reference + '.sa'
    run_blasr(query, reference, options)
    check_output_file(output)
    return output
def split_results(amp_analysis): """Split the output of an Amplicon Analysis job by Barcode""" assert os.path.isdir(amp_analysis) sequence_path = os.path.join(amp_analysis, 'amplicon_analysis.fasta') check_output_file(sequence_path) print "Analyzing %s output sequences" % fasta_size(sequence_path) barcode_path = os.path.join(amp_analysis, 'by_barcode') create_directory(barcode_path) records = list(FastaReader(sequence_path)) barcodes = {get_barcode(r): [] for r in records} [barcodes[get_barcode(r)].append(r) for r in records] barcode_files = {} for barcode, records in barcodes.iteritems(): barcode_file = barcode + '.fasta' sample_path = os.path.join(barcode_path, barcode_file) with FastaWriter(sample_path) as handle: for record in records: handle.writeRecord(record) barcode_files[barcode] = sample_path return barcode_files
def extract_best_reads(input_file, output_file=None,
                       min_length=MIN_LENGTH,
                       min_score=MIN_SCORE):
    """
    Extract, filter and subset subreads from Bas/Bax/Fofn Files

    Subreads shorter than `min_length` or scoring below `min_score` are
    discarded; the survivors are written to `output_file` (default
    "<input basename>.best.fasta"), which is returned.
    """
    if output_file is None:
        basename = '.'.join( input_file.split('.')[:-1] )
        output_file = '%s.best.fasta' % basename
    log.info('Extracting subreads from %s' % os.path.basename(input_file))
    log.debug('\tMinimum Length:\t%s' % min_length)
    log.debug('\tMinimum Score:\t%s' % min_score)
    reads = []
    # BUG FIX: the file count previously came from the loop variable "i",
    # which is undefined (NameError) when no input files are yielded
    file_count = 0
    for filename in _iterate_input_files( input_file ):
        file_count += 1
        reads += list( _extract_from_bash5( filename, min_length, min_score ))
    log.info("Extracted %s subreads from %s files" % (len(reads), file_count))
    write_fasta( reads, output_file )
    check_output_file( output_file )
    log.info("Finished extracting subreads")
    return output_file
def write_amp_analysis_records( records, filename ):
    """Write each AmpAnalysisRecord in `records` to `filename` as Fasta."""
    log.info("Writing {0} AmpAnalysisRecords to {1}".format(len(records), filename))
    writer = AmpliconAnalysisWriter( filename )
    with writer as handle:
        for item in records:
            handle.write_fasta( item )
    check_output_file( filename )
def write_fastq_records( records, filename ):
    """Write a collection of FastqRecords out to `filename`."""
    log.info("Writing {0} FastqRecords to {1}".format(len(records), filename))
    with FastqWriter( filename ) as writer:
        for item in records:
            writer.writeRecord( item )
    # Verify the output before returning
    check_output_file( filename )
def run_hmmsearch( query, reference, args ):
    """Build and execute an hmmsearch command, checking any table output."""
    cmd = create_hmmsearch_command( query, reference, args )
    log_command( cmd )
    execute_command( cmd )
    # Only a 'domtblout' run produces a table file to validate
    if 'domtblout' in cmd:
        check_output_file( cmd['domtblout'] )
def run_hmmsearch(query, reference, args):
    """Run hmmsearch on `query` against `reference` with the given args."""
    hmmer_cmd = create_hmmsearch_command(query, reference, args)
    log_command(hmmer_cmd)
    execute_command(hmmer_cmd)
    if "domtblout" in hmmer_cmd:
        # A domain-table run leaves a file behind to validate
        check_output_file(hmmer_cmd["domtblout"])
def write_fastq_records(records, filename):
    """Serialize every FastqRecord in `records` to `filename`."""
    log.info("Writing {0} FastqRecords to {1}".format(len(records), filename))
    fastq_writer = FastqWriter(filename)
    with fastq_writer as handle:
        for entry in records:
            handle.writeRecord(entry)
    check_output_file(filename)