import logging

# FastaReader and FastaWriter match the pbcore.io API used below; BlasrReader,
# BlasrM1, BlasrM5, SamReader, get_base_sequence_name and closed_file_handles
# are assumed to be imported from elsewhere in this package.
from pbcore.io import FastaReader, FastaWriter

log = logging.getLogger(__name__)


def create_m5_reference(m5_file):
    """
    Map each query to the target it aligns to with the fewest differences
    """
    log.info('Parsing Blasr M5 results from "{0}"'.format(m5_file))
    results = {}
    diffs = {}
    for record in BlasrReader(m5_file):
        qname = get_base_sequence_name(record.qname)
        tname = get_base_sequence_name(record.tname)
        # Total differences = mismatches + insertions + deletions
        diff_count = int(record.nmis) + int(record.nins) + int(record.ndel)
        # Record the first hit for a query, then keep whichever has fewer diffs
        if qname not in diffs:
            results[qname] = tname
            diffs[qname] = diff_count
        elif diffs[qname] > diff_count:
            results[qname] = tname
            diffs[qname] = diff_count
    log.info('Finished reading Blasr results')
    return results
def create_m1_reference(m1_file, reference=None):
    """
    Map each query to its M1 target, optionally renamed via a reference dict
    """
    log.info('Parsing Blasr M1 results from "{0}"'.format(m1_file))
    results = {}
    for record in BlasrReader(m1_file):
        qname = get_base_sequence_name(record.qname)
        tname = get_base_sequence_name(record.tname)
        # M1 input is expected to contain at most one alignment per query
        if qname in results:
            msg = 'Duplicate sequence ids found! "{0}"'.format(qname)
            log.info(msg)
            raise KeyError(msg)
        if reference:
            results[qname] = reference[tname]
        else:
            results[qname] = tname
    log.info('Finished reading Blasr results')
    return results
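# A minimal usage sketch (hypothetical file names and locus map): the M5
# builder keeps the lowest-difference hit per query, while the M1 builder can
# rename targets through an existing reference dict.
#
#   best_hits = create_m5_reference('contigs_to_refs.m5')
#   locus_map = {'HLA-A_ref': 'A', 'HLA-B_ref': 'B'}
#   contig_loci = create_m1_reference('contigs_to_refs.m1', reference=locus_map)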
def append_typing_results(summary_file, combined_typings, output_file):
    """
    Copy a summary file, appending the typing columns to each record
    """
    # Note the trailing newline, which terminates the extended header row
    typing_header = '\tGenType\tGenPctId\tExonType\tExonPctId\tType\n'
    with open(output_file, 'w') as output:
        with open(summary_file, 'r') as handle:
            header = next(handle).strip()
            output.write(header + typing_header)
            for line in handle:
                parts = line.strip().split()
                name = get_base_sequence_name(parts[1])
                parts += combined_typings[name]
                output.write('\t'.join(parts) + '\n')
def _parse_blasr_alignment(blasr_file):
    """
    Map each query to its [target, percent-identity] pair
    """
    results = {}
    for entry in BlasrReader(blasr_file):
        name = get_base_sequence_name(entry.qname)
        if isinstance(entry, BlasrM1):
            # M1 records report percent similarity directly
            results[name] = [entry.tname, entry.pctsimilarity]
        elif isinstance(entry, BlasrM5):
            # M5 records do not, so compute it from the match and error counts
            diffs = int(entry.nmis) + int(entry.nins) + int(entry.ndel)
            pctid = 100 * int(entry.nmat) / float(int(entry.nmat) + diffs)
            results[name] = [entry.tname, pctid]
    return results
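# A minimal sketch (hypothetical path and cutoff): _parse_blasr_alignment
# yields one [target, percent-identity] pair per query for either BLASR
# output format, so callers don't need to know which format was produced.
#
#   hits = _parse_blasr_alignment('contigs_to_exons.m5')
#   confident = dict((name, hit) for name, hit in hits.items()
#                    if float(hit[1]) >= 99.0)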
def parse_typing(typing_file):
    """
    Map each sequence name to its [typing, percent-identity] call
    """
    results = {}
    with open(typing_file) as handle:
        for line in handle:
            # Skip the header row
            if line.startswith('Locus'):
                continue
            parts = line.strip().split()
            name = get_base_sequence_name(parts[1])
            typing = parts[2]
            pctid = parts[5]
            results[name] = [typing, pctid]
    return results
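# A hedged composition sketch (hypothetical file names; the real pipeline may
# combine calls differently): combined_typings must supply one field for each
# column in append_typing_results' header, i.e. five fields per sequence.
#
#   gen = parse_typing('gDNA_typing.txt')
#   exon = parse_typing('cDNA_typing.txt')
#   combined = {}
#   for name in gen:
#       gen_type, gen_pctid = gen[name]
#       exon_type, exon_pctid = exon.get(name, ['N/A', 'N/A'])
#       combined[name] = [gen_type, gen_pctid, exon_type, exon_pctid, gen_type]
#   append_typing_results('amplicon_summary.txt', combined, 'typed_summary.txt')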
def separate_listed_sequences(fasta_file, good_values, good_output, bad_output):
    """
    Separate a fasta file into two based on a supplied value list
    """
    with FastaWriter(good_output) as good_handle:
        with FastaWriter(bad_output) as bad_handle:
            for record in FastaReader(fasta_file):
                name = get_base_sequence_name(record.name)
                if name in good_values:
                    good_handle.writeRecord(record)
                else:
                    bad_handle.writeRecord(record)
def separate_aligned_sequences(fasta_file, dictionary, good_values,
                               good_output, bad_output):
    """
    Separate a fasta file into two based on a supplied dictionary and value list
    """
    with FastaWriter(good_output) as good_handle:
        with FastaWriter(bad_output) as bad_handle:
            for record in FastaReader(fasta_file):
                name = get_base_sequence_name(record.name)
                # Sequences absent from the dictionary count as "Unmapped"
                value = dictionary.get(name, "Unmapped")
                if value in good_values:
                    good_handle.writeRecord(record)
                else:
                    bad_handle.writeRecord(record)
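# A minimal sketch (hypothetical inputs): split reads with an explicit name
# whitelist, or by the locus each read was assigned in an alignment step.
#
#   keep = set(['read_1', 'read_7'])
#   separate_listed_sequences('reads.fasta', keep, 'good.fasta', 'bad.fasta')
#
#   read_loci = {'read_1': 'A', 'read_2': 'DRB1'}
#   separate_aligned_sequences('reads.fasta', read_loci, set(['A', 'B', 'C']),
#                              'class_I.fasta', 'other.fasta')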
def create_sam_reference(sam_file, reference=None):
    """
    Map each query to its SAM reference, optionally renamed via a dict
    """
    log.info('Parsing SAM alignments from "{0}"'.format(sam_file))
    results = {}
    for record in SamReader(sam_file):
        name = get_base_sequence_name(record.rname)
        # Each query is expected to appear in at most one alignment
        if record.qname in results:
            msg = 'Duplicate sequence ids found! "{0}"'.format(record.qname)
            log.info(msg)
            raise KeyError(msg)
        if reference:
            results[record.qname] = reference[name]
        else:
            results[record.qname] = name
    log.info('Finished reading SAM file results')
    return results
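# A minimal sketch (hypothetical paths): as with create_m1_reference above, a
# reference dict can translate raw reference names into locus labels.
#
#   locus_map = {'HLA-A_ref': 'A', 'HLA-B_ref': 'B'}
#   read_loci = create_sam_reference('reads_to_refs.sam', reference=locus_map)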
def separate_sequences(fasta_file, dictionary, prefix=''):
    """
    Separate a fasta file into multiple groups based on some dict
    """
    file_handles = {}
    for record in FastaReader(fasta_file):
        name = get_base_sequence_name(record.name)
        group = dictionary.get(name, "Unmapped")
        group_file = prefix + '_' + group + '.fasta'
        # Open each group's output file lazily, on first use
        try:
            file_handles[group_file].writeRecord(record)
        except KeyError:
            file_handles[group_file] = FastaWriter(group_file)
            file_handles[group_file].writeRecord(record)
    return closed_file_handles(file_handles)
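# A minimal sketch (hypothetical inputs): fan a fasta out into one file per
# group, e.g. 'Subreads_A.fasta' and 'Subreads_B.fasta' here, with any
# sequence missing from the dict landing in 'Subreads_Unmapped.fasta'.
#
#   groups = {'read_1': 'A', 'read_2': 'B'}
#   separate_sequences('subreads.fasta', groups, prefix='Subreads')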