def extract_alleles(input_file, output_file=None, reference_file=None,
                    alignment_file=None, method=METHOD, sort=SORT, loci=LOCI):
    """Select the best-ranked consensus sequences per group and write them out.

    The alignments for ``input_file`` are grouped according to ``method``,
    every group is ordered by the ``sort`` metric, a selection is made from
    the ordered groups and the matching input records are written to
    ``output_file``.

    Args:
        input_file: Sequence file holding the consensus sequences.
        output_file: Destination file; derived from ``input_file`` when falsy.
        reference_file: Forwarded to ``get_alignment_file``.
        alignment_file: Forwarded to ``get_alignment_file``.
        method: Grouping policy, one of 'locus', 'barcode', 'both' or 'all'.
            A falsy value falls back to ``METHOD``.
        sort: Ranking metric, either 'num_reads' or 'accuracy'.  A falsy
            value falls back to ``SORT``.
        loci: Forwarded to the locus-aware grouping helpers.  A falsy value
            falls back to ``LOCI``.

    Returns:
        The name of the file the selected sequences were written to.

    Raises:
        ValueError: If ``method`` or ``sort`` is not a recognised value, or
            if sorting by 'accuracy' is requested for a non-Fastq input.
    """
    # Callers may pass None explicitly (e.g. unset options), so every
    # tunable falls back to its module-level default in the same way.
    method = method or METHOD
    sort = sort or SORT
    loci = loci or LOCI

    # Set the output file if not specified
    output_file = output_file or _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # Resolve the alignment file and load every alignment record up front
    alignment_file = get_alignment_file(input_file, reference_file,
                                        alignment_file)
    alignments = list(BlasrReader(alignment_file))

    # Run the appropriate grouping
    if method == 'locus':
        groups = _group_by_locus(alignments, loci)
    elif method == 'barcode':
        groups = _group_by_barcode(alignments)
    elif method == 'both':
        groups = _group_by_both(alignments, loci)
    elif method == 'all':
        # Every alignment forms its own single-member group
        groups = {a.qname: [a] for a in alignments}
    else:
        msg = "Invalid Selection Metric: %s" % method
        log.error(msg)
        raise ValueError(msg)

    # Read the input sequences and use them to generate our sorting data
    sequences = read_sequences(input_file)
    if sort == 'num_reads':
        sorting_data = {s.name: consensus_size(s) for s in sequences}
    elif sort == 'accuracy':
        # An explicit check rather than an assert: asserts are removed when
        # Python runs with -O, which would let non-Fastq input through.
        if get_file_type(input_file) != 'fastq':
            msg = 'Sorting by "accuracy" requires Fastq input: %s' % input_file
            log.error(msg)
            raise ValueError(msg)
        sorting_data = {s.name: record_accuracy(s) for s in sequences}
    else:
        msg = "Invalid Sorting Metric: %s" % sort
        log.error(msg)
        raise ValueError(msg)

    log.info('Sorting sequences for selection according to "%s"' % sort)
    ordered = _sort_groups(groups, sorting_data)

    log.info('Selecting top sequences from %s according to the "%s" policy'
             % (input_file, method))
    selected = list(_select_sequences(ordered))
    log.info('Selected %s sequences from %s total for further analysis'
             % (len(selected), len(sequences)))

    log.info('Writing the selected sequences out to %s' % output_file)
    subset = list(_subset_sequences(sequences, selected))
    _write_output(subset, output_file, output_type)
    return output_file
def extract_alleles(input_file, output_file=None, reference_file=None,
                    alignment_file=None, method=METHOD, sort=SORT, loci=LOCI):
    """Select the best-ranked consensus sequences per group and write them out.

    The alignments for ``input_file`` are grouped according to ``method``,
    every group is ordered by the ``sort`` metric, a selection is made from
    the ordered groups and the matching input records are written to
    ``output_file``.

    Args:
        input_file: Sequence file holding the consensus sequences.
        output_file: Destination file; derived from ``input_file`` when falsy.
        reference_file: Forwarded to ``get_alignment_file``.
        alignment_file: Forwarded to ``get_alignment_file``.
        method: Grouping policy, one of 'locus', 'barcode', 'both' or 'all'.
            A falsy value falls back to ``METHOD``.
        sort: Ranking metric, either 'num_reads' or 'accuracy'.  A falsy
            value falls back to ``SORT``.
        loci: Forwarded to the locus-aware grouping helpers.  A falsy value
            falls back to ``LOCI``.

    Returns:
        The name of the file the selected sequences were written to.

    Raises:
        ValueError: If ``method`` or ``sort`` is not a recognised value, or
            if sorting by 'accuracy' is requested for a non-Fastq input.
    """
    # Callers may pass None explicitly (e.g. unset options), so every
    # tunable falls back to its module-level default in the same way.
    method = method or METHOD
    sort = sort or SORT
    loci = loci or LOCI

    # Set the output file if not specified
    output_file = output_file or _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # Resolve the alignment file and load every alignment record up front
    alignment_file = get_alignment_file(input_file, reference_file,
                                        alignment_file)
    alignments = list(BlasrReader(alignment_file))

    # Run the appropriate grouping
    if method == 'locus':
        groups = _group_by_locus(alignments, loci)
    elif method == 'barcode':
        groups = _group_by_barcode(alignments)
    elif method == 'both':
        groups = _group_by_both(alignments, loci)
    elif method == 'all':
        # Every alignment forms its own single-member group
        groups = {a.qname: [a] for a in alignments}
    else:
        msg = "Invalid Selection Metric: %s" % method
        log.error(msg)
        raise ValueError(msg)

    # Read the input sequences and use them to generate our sorting data
    sequences = read_sequences(input_file)
    if sort == 'num_reads':
        sorting_data = {s.name: consensus_size(s) for s in sequences}
    elif sort == 'accuracy':
        # An explicit check rather than an assert: asserts are removed when
        # Python runs with -O, which would let non-Fastq input through.
        if get_file_type(input_file) != 'fastq':
            msg = 'Sorting by "accuracy" requires Fastq input: %s' % input_file
            log.error(msg)
            raise ValueError(msg)
        sorting_data = {s.name: record_accuracy(s) for s in sequences}
    else:
        msg = "Invalid Sorting Metric: %s" % sort
        log.error(msg)
        raise ValueError(msg)

    log.info('Sorting sequences for selection according to "%s"' % sort)
    ordered = _sort_groups(groups, sorting_data)

    log.info('Selecting top sequences from %s according to the "%s" policy'
             % (input_file, method))
    selected = list(_select_sequences(ordered))
    log.info('Selected %s sequences from %s total for further analysis'
             % (len(selected), len(sequences)))

    log.info('Writing the selected sequences out to %s' % output_file)
    subset = list(_subset_sequences(sequences, selected))
    _write_output(subset, output_file, output_type)
    return output_file
def get_output_file(input_file, modifier):
    """Build an output filename by inserting ``modifier`` before the file type.

    The part of ``input_file`` preceding its last '.' is used as the stem
    (empty when the name contains no '.'), and the result has the form
    ``<stem>.<modifier>.<type>`` where ``<type>`` is the value returned by
    ``get_file_type`` for the input file.
    """
    # rpartition yields an empty head when there is no '.', which matches
    # joining all-but-the-last '.'-separated pieces.
    stem, _, _ = input_file.rpartition('.')
    suffix = get_file_type(input_file)
    return '%s.%s.%s' % (stem, modifier, suffix)
def _get_output_file(input_file):
    """Derive the default '.oriented' output filename from the input filename.

    The part of ``input_file`` preceding its last '.' is used as the stem
    (empty when the name contains no '.'), and the result has the form
    ``<stem>.oriented.<type>`` where ``<type>`` is the value returned by
    ``get_file_type`` for the input file.
    """
    # rpartition yields an empty head when there is no '.', which matches
    # joining all-but-the-last '.'-separated pieces.
    stem, _, _ = input_file.rpartition('.')
    suffix = get_file_type(input_file)
    return '%s.oriented.%s' % (stem, suffix)
def get_output_file(input_file, modifier):
    """Build an output filename by inserting ``modifier`` before the file type.

    The part of ``input_file`` preceding its last '.' is used as the stem
    (empty when the name contains no '.'), and the result has the form
    ``<stem>.<modifier>.<type>`` where ``<type>`` is the value returned by
    ``get_file_type`` for the input file.
    """
    # rpartition yields an empty head when there is no '.', which matches
    # joining all-but-the-last '.'-separated pieces.
    stem, _, _ = input_file.rpartition('.')
    suffix = get_file_type(input_file)
    return '%s.%s.%s' % (stem, modifier, suffix)
def _get_output_file(input_file):
    """Derive the default '.oriented' output filename from the input filename.

    The part of ``input_file`` preceding its last '.' is used as the stem
    (empty when the name contains no '.'), and the result has the form
    ``<stem>.oriented.<type>`` where ``<type>`` is the value returned by
    ``get_file_type`` for the input file.
    """
    # rpartition yields an empty head when there is no '.', which matches
    # joining all-but-the-last '.'-separated pieces.
    stem, _, _ = input_file.rpartition('.')
    suffix = get_file_type(input_file)
    return '%s.oriented.%s' % (stem, suffix)
def _get_output_type(output_file):
    """Return the file type of ``output_file`` after validating it.

    Returns:
        'fasta' or 'fastq', as reported by ``get_file_type``.

    Raises:
        TypeError: If the reported type is neither 'fasta' nor 'fastq'.
    """
    file_type = get_file_type(output_file)
    # Reject unsupported formats first so the happy path is a plain return
    if file_type not in ('fasta', 'fastq'):
        msg = "Output file must be either Fasta or Fastq format"
        log.error(msg)
        raise TypeError(msg)
    return file_type
def _get_output_type(output_file):
    """Return the file type of ``output_file`` after validating it.

    Returns:
        'fasta' or 'fastq', as reported by ``get_file_type``.

    Raises:
        TypeError: If the reported type is neither 'fasta' nor 'fastq'.
    """
    file_type = get_file_type(output_file)
    # Reject unsupported formats first so the happy path is a plain return
    if file_type not in ('fasta', 'fastq'):
        msg = "Output file must be either Fasta or Fastq format"
        log.error(msg)
        raise TypeError(msg)
    return file_type
def _parse_input_records(input_file):
    """Read every sequence record from a Fasta or Fastq file into a list.

    The reader is chosen from the file type reported by ``get_file_type``:
    ``FastaReader`` for 'fasta' and ``FastqReader`` for 'fastq'.

    Returns:
        A list of the records produced by the selected reader.

    Raises:
        TypeError: If the file type is neither 'fasta' nor 'fastq'.
    """
    input_type = get_file_type(input_file)
    if input_type == 'fasta':
        return list(FastaReader(input_file))
    if input_type == 'fastq':
        return list(FastqReader(input_file))

    # Logging alone would let the function fall through and return None,
    # which only surfaces later as an obscure failure in the caller; raise
    # instead, like the other file-type checks in this code.
    msg = 'Input file must be either Fasta or Fastq'
    log.error(msg)
    raise TypeError(msg)
def exons_to_cDNA(exon_file):
    """Merge the exon sequences of a multi-record file into one cDNA record.

    The exon records are parsed from ``exon_file``; when there is at least
    one, they are sorted, combined into a single record and handed to
    ``write_sequences`` together with the derived output filename.  Nothing
    is written when no records are found.
    """
    file_type = get_file_type(exon_file)
    output_file = _get_output_file(exon_file, file_type)
    exon_records = _parse_exon_records(exon_file, file_type)
    log.info("Combinging %s exons sequences to cDNA" % len(exon_records))

    # Without any exons there is nothing to combine or write
    if len(exon_records) == 0:
        return

    cDNA_record = _combine_records(_sort_records(exon_records))
    log.info("Writing cDNA sequence out to %s" % output_file)
    write_sequences(cDNA_record, output_file)
def exons_to_cDNA(exon_file):
    """Merge the exon sequences of a multi-record file into one cDNA record.

    The exon records are parsed from ``exon_file``; when there is at least
    one, they are sorted, combined into a single record and handed to
    ``write_sequences`` together with the derived output filename.  Nothing
    is written when no records are found.
    """
    file_type = get_file_type(exon_file)
    output_file = _get_output_file(exon_file, file_type)
    exon_records = _parse_exon_records(exon_file, file_type)
    log.info("Combinging %s exons sequences to cDNA" % len(exon_records))

    # Without any exons there is nothing to combine or write
    if len(exon_records) == 0:
        return

    cDNA_record = _combine_records(_sort_records(exon_records))
    log.info("Writing cDNA sequence out to %s" % output_file)
    write_sequences(cDNA_record, output_file)
def extract_alleles(input_file, min_reads, min_length, output_file=None):
    """Filter the consensus sequences of a file and write the survivors.

    The input records are passed through ``_filter_on_length`` with
    ``min_length`` and then ``_filter_on_numreads`` with ``min_reads``; what
    remains is handed to ``_write_output``.

    Args:
        input_file: Sequence file to read the records from.
        min_reads: Threshold forwarded to ``_filter_on_numreads``.
        min_length: Threshold forwarded to ``_filter_on_length``.
        output_file: Destination file; derived from ``input_file`` when falsy.

    Returns:
        The name of the file the filtered sequences were written to.
    """
    # Fall back to a filename derived from the input when none was given
    if not output_file:
        output_file = _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # Apply the two filters in sequence: length first, then read count
    records = _parse_input_records(input_file)
    long_enough = _filter_on_length(records, min_length)
    kept = _filter_on_numreads(long_enough, min_reads)

    _write_output(kept, output_file, output_type)
    return output_file
def extract_exons(input_record, exon_fofn, directory=None):
    """Extract the exons of one sequence record via ``_extract_exons``.

    Args:
        input_record: A filename (``str``), a ``FastaRecord`` or a
            ``FastqRecord``.  A filename is loaded with
            ``_read_fasta_record`` and its output type is taken from
            ``get_file_type``; record objects map to 'fasta' / 'fastq'.
        exon_fofn: Forwarded to ``_extract_exons``.
        directory: Forwarded to ``_extract_exons``.

    Returns:
        Whatever ``_extract_exons`` returns.

    Raises:
        TypeError: If ``input_record`` is none of the accepted types.
    """
    if isinstance(input_record, str):
        # A string is treated as a path: note its type, then load the record
        record_type = get_file_type(input_record)
        record = _read_fasta_record(input_record)
    else:
        record = input_record
        if isinstance(record, FastaRecord):
            record_type = 'fasta'
        elif isinstance(record, FastqRecord):
            record_type = 'fastq'
        else:
            msg = 'Input record must be Filename, FastaRecord or FastqRecord'
            log.error(msg)
            raise TypeError(msg)
    return _extract_exons(record, exon_fofn, record_type, directory)
def extract_exons(input_record, exon_fofn, directory=None):
    """Extract the exons of one sequence record via ``_extract_exons``.

    Args:
        input_record: A filename (``str``), a ``FastaRecord`` or a
            ``FastqRecord``.  A filename is loaded with
            ``_read_fasta_record`` and its output type is taken from
            ``get_file_type``; record objects map to 'fasta' / 'fastq'.
        exon_fofn: Forwarded to ``_extract_exons``.
        directory: Forwarded to ``_extract_exons``.

    Returns:
        Whatever ``_extract_exons`` returns.

    Raises:
        TypeError: If ``input_record`` is none of the accepted types.
    """
    if isinstance(input_record, str):
        # A string is treated as a path: note its type, then load the record
        record_type = get_file_type(input_record)
        record = _read_fasta_record(input_record)
    else:
        record = input_record
        if isinstance(record, FastaRecord):
            record_type = 'fasta'
        elif isinstance(record, FastqRecord):
            record_type = 'fastq'
        else:
            msg = 'Input record must be Filename, FastaRecord or FastqRecord'
            log.error(msg)
            raise TypeError(msg)
    return _extract_exons(record, exon_fofn, record_type, directory)
def trim_alleles(input_file, output_file=None, trim=0):
    """Trim the sequences of a file and write the result to a new file.

    Args:
        input_file: Sequence file to read.
        output_file: Destination file; derived from ``input_file`` when falsy.
        trim: Amount forwarded to ``_trim_sequences`` (logged as bp removed
            from each end).

    Returns:
        ``input_file`` unchanged when ``trim`` is 0 and no output file was
        requested; otherwise the name of the file that was written.
    """
    # With nothing to trim and no explicit destination the input is reused
    skip = trim == 0 and output_file is None
    if skip:
        log.info('No trimming necessary for "%s", skipping...' % input_file)
        return input_file

    # Fall back to a filename derived from the input when none was given
    if not output_file:
        output_file = _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # Load the records, trim them and write them back out
    records = read_sequences(input_file)
    log.info("Trimming sequences by %s bp from each end" % trim)
    trimmed_records = _trim_sequences(records, trim)
    log.info("Writing the trimmed sequences out to %s" % output_file)
    _write_output(trimmed_records, output_file, output_type)
    return output_file
def trim_alleles(input_file, output_file=None, trim=0):
    """Trim the sequences of a file and write the result to a new file.

    Args:
        input_file: Sequence file to read.
        output_file: Destination file; derived from ``input_file`` when falsy.
        trim: Amount forwarded to ``_trim_sequences`` (logged as bp removed
            from each end).

    Returns:
        ``input_file`` unchanged when ``trim`` is 0 and no output file was
        requested; otherwise the name of the file that was written.
    """
    # With nothing to trim and no explicit destination the input is reused
    skip = trim == 0 and output_file is None
    if skip:
        log.info('No trimming necessary for "%s", skipping...' % input_file)
        return input_file

    # Fall back to a filename derived from the input when none was given
    if not output_file:
        output_file = _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # Load the records, trim them and write them back out
    records = read_sequences(input_file)
    log.info('Trimming sequences by %s bp from each end' % trim)
    trimmed_records = _trim_sequences(records, trim)
    log.info('Writing the trimmed sequences out to %s' % output_file)
    _write_output(trimmed_records, output_file, output_type)
    return output_file
def _get_output_file(input_file):
    """Derive the default '.filtered' output filename from the input filename.

    The part of ``input_file`` preceding its last '.' is used as the stem
    (empty when the name contains no '.'), and the result has the form
    ``<stem>.filtered.<type>`` where ``<type>`` is the value returned by
    ``get_file_type`` for the input file.
    """
    # rpartition yields an empty head when there is no '.', which matches
    # joining all-but-the-last '.'-separated pieces.
    stem, _, _ = input_file.rpartition('.')
    suffix = get_file_type(input_file)
    return '%s.filtered.%s' % (stem, suffix)
def _get_output_file(input_file):
    """Derive the default '.selected' output filename from the input filename.

    The part of ``input_file`` preceding its last '.' is used as the stem
    (empty when the name contains no '.'), and the result has the form
    ``<stem>.selected.<type>`` where ``<type>`` is the value returned by
    ``get_file_type`` for the input file.
    """
    # rpartition yields an empty head when there is no '.', which matches
    # joining all-but-the-last '.'-separated pieces.
    stem, _, _ = input_file.rpartition('.')
    suffix = get_file_type(input_file)
    return '%s.selected.%s' % (stem, suffix)
def _get_output_file(input_file):
    """Derive the default '.trimmed' output filename from the input filename.

    The part of ``input_file`` preceding its last '.' is used as the stem
    (empty when the name contains no '.'), and the result has the form
    ``<stem>.trimmed.<type>`` where ``<type>`` is the value returned by
    ``get_file_type`` for the input file.
    """
    # rpartition yields an empty head when there is no '.', which matches
    # joining all-but-the-last '.'-separated pieces.
    stem, _, _ = input_file.rpartition(".")
    suffix = get_file_type(input_file)
    return "%s.trimmed.%s" % (stem, suffix)