def orient_sequences(input_file, reference_file=None, alignment_file=None, output_file=None):
    """
    Reorient the sequences of an input file to match the direction of their reference

    The destination defaults to whatever _get_output_file derives from
    input_file. If a valid output file already exists, nothing is recomputed
    and its path is returned as-is. Otherwise the sequences flagged by
    _identify_reversed_sequences are passed to _reverse_records and the
    result is written out.

    Returns the path of the output file.
    """
    log.info("Reorienting all sequences in %s to the direction of their reference" % input_file)

    # Resolve the destination and its type before doing any real work
    if not output_file:
        output_file = _get_output_file(input_file)
    output_type = _get_output_type(output_file)

    # A previous run already produced the result: reuse it
    if valid_file(output_file):
        log.info("Found existing output file %s, skipping orientation step" % output_file)
        return output_file

    # Obtain an alignment (computed if needed) and find what must be flipped
    alignment_file = get_alignment_file(input_file, reference_file, alignment_file)
    to_flip = _identify_reversed_sequences(alignment_file)
    log.info("Identified %s sequences needing Reverse Complementation" % len(to_flip))

    records = _parse_input_records(input_file)
    oriented = _reverse_records(records, to_flip)

    log.info("Writing out sequences to %s" % output_file)
    _write_output(oriented, output_file, output_type)
    return output_file
def get_cDNA_reference():
    """
    Return the path of the cDNA reference (CDNA_REF), creating it if needed.

    If CDNA_REF is not yet a valid file, create_cDNA_reference() is run
    exactly once and the file is checked again.

    Raises:
        IOError: if CDNA_REF is still not a valid file after creation.
    """
    if valid_file(CDNA_REF):
        logging.info('Using existing cDNA Reference fasta')
        return CDNA_REF

    logging.info('Creating new cDNA Reference fasta')
    create_cDNA_reference()

    # Check once instead of recursing: the previous self-call would re-run
    # the creation step until the recursion limit was hit whenever creation
    # did not leave a valid file behind.
    if not valid_file(CDNA_REF):
        msg = 'Failed to create a valid cDNA Reference fasta'
        logging.error(msg)
        raise IOError(msg)
    return CDNA_REF
def get_genomic_reference():
    """
    Return the path of the genomic reference (GEN_REF), creating it if needed.

    If GEN_REF is not yet a valid file, create_genomic_reference() is run
    exactly once and the file is checked again.

    Raises:
        IOError: if GEN_REF is still not a valid file after creation.
    """
    if valid_file(GEN_REF):
        logging.info('Using existing Genomic Reference fasta')
        return GEN_REF

    logging.info('Creating new Genomic Reference fasta')
    create_genomic_reference()

    # Check once instead of recursing: the previous self-call would re-run
    # the creation step until the recursion limit was hit whenever creation
    # did not leave a valid file behind.
    if not valid_file(GEN_REF):
        msg = 'Failed to create a valid Genomic Reference fasta'
        logging.error(msg)
        raise IOError(msg)
    return GEN_REF
def get_exon_reference():
    """
    Return the path of the exon reference (EXON_REF), creating it if needed.

    If EXON_REF is not yet a valid file, create_exon_fofns() and then
    create_exon_reference() are each run exactly once and the file is
    checked again.

    Raises:
        IOError: if EXON_REF is still not a valid file after creation.
    """
    if valid_file(EXON_REF):
        logging.info('Using existing Exon Reference fofn')
        return EXON_REF

    logging.info('Creating new Exon Reference fofn')
    create_exon_fofns()
    create_exon_reference()

    # Check once instead of recursing: the previous self-call would re-run
    # both creation steps until the recursion limit was hit whenever they
    # did not leave a valid file behind.
    if not valid_file(EXON_REF):
        msg = 'Failed to create a valid Exon Reference fofn'
        logging.error(msg)
        raise IOError(msg)
    return EXON_REF
def sort_subreads(subread_file, reference_file):
    """
    Map each aligned query name to the target name it hit.

    The alignment stored in 'temp2.m1' is reused when valid_file accepts
    it; otherwise align_best_reference is run first to produce it.
    Returns a dict of hit.qname -> hit.tname over every hit read from
    that file.
    """
    log.info("Aligning subreads to the two best references")
    alignment = 'temp2.m1'

    # Only run the aligner when no usable alignment is on disk yet
    if not valid_file(alignment):
        align_best_reference(subread_file, reference_file, alignment)

    return dict((hit.qname, hit.tname) for hit in BlasrReader(alignment))
def order_references(subread_file, reference_file):
    """
    Rank the reference names by how many alignment hits each received.

    The alignment stored in 'temp.m1' is reused when valid_file accepts
    it; otherwise align_best_reference is run first to produce it.
    Returns a list of every hit target name, most frequently hit first.
    """
    log.info("Selecting the best references sequences to use")
    alignment = 'temp.m1'

    # Only run the aligner when no usable alignment is on disk yet
    if not valid_file(alignment):
        align_best_reference(subread_file, reference_file, alignment)

    hit_counts = Counter(hit.tname for hit in BlasrReader(alignment))
    return [name for name, _ in hit_counts.most_common()]
def sort_subreads(subread_file, reference_file):
    """
    Build a lookup from query name to the target name it aligned to.

    If 'temp2.m1' is not already a valid file, align_best_reference is
    invoked to write it. Every hit in that file then contributes one
    hit.qname -> hit.tname entry to the returned dict.
    """
    log.info("Aligning subreads to the two best references")
    m1_path = 'temp2.m1'

    # Skip the alignment step when a previous result can be reused
    if not valid_file(m1_path):
        align_best_reference(subread_file, reference_file, m1_path)

    return dict((hit.qname, hit.tname) for hit in BlasrReader(m1_path))
def order_references(subread_file, reference_file):
    """
    Order the reference names from most to least frequently hit.

    If 'temp.m1' is not already a valid file, align_best_reference is
    invoked to write it. The target name of every hit in that file is
    tallied, and all names are returned as a list, most common first.
    """
    log.info("Selecting the best references sequences to use")
    m1_path = 'temp.m1'

    # Skip the alignment step when a previous result can be reused
    if not valid_file(m1_path):
        align_best_reference(subread_file, reference_file, m1_path)

    tally = Counter(hit.tname for hit in BlasrReader(m1_path))
    return [target for target, _ in tally.most_common()]
def _align_sequences(query, reference):
    """
    Align one fasta file of sequences to another

    The alignment is written to a temporary '.m1' file, which is removed
    again before returning, whether or not the alignment succeeded.

    Returns:
        A list of the hits read by BlasrReader when the alignment output
        is a valid file, otherwise None.
    """
    temp = NamedTemporaryFile(suffix='.m1', delete=False)
    # Only the reserved path is needed; close the handle right away so it
    # is not left open for the lifetime of the object.
    temp.close()
    try:
        align_best_reference(query, reference, output=temp.name)
        if valid_file(temp.name):
            return list(BlasrReader(temp.name))
        return None
    finally:
        # Clean up even when alignment or parsing raises
        os.unlink(temp.name)
def get_input_file(input):
    """
    Resolve the supplied input to a sequence file.

    A directory is handed to get_amplicon_analysis_output and its result
    returned. A valid file is returned unchanged when is_fasta or
    is_fastq accepts it.

    Raises:
        IOError: if the input is neither a directory nor a valid file, or
            is a valid file that is neither Fasta nor Fastq.
    """
    # Directories are searched for sequence files
    if os.path.isdir(input):
        log.info("Input appears to be a directory, looking for sequence files")
        return get_amplicon_analysis_output(input)

    # Anything else must at least be a valid file
    if not valid_file(input):
        msg = "Supplied input does not appear to be a valid AmpliconAnalysis file or directory"
        log.error(msg)
        raise IOError(msg)

    if is_fasta(input):
        log.info("Input appears to be a valid Fasta file")
        return input

    if is_fastq(input):
        log.info("Input appears to be a valid Fastq file")
        return input

    msg = "Input is not a valid Fasta or Fastq file!"
    log.error(msg)
    raise IOError(msg)
def get_input_file(input):
    """
    Turn the supplied input into the path of a usable sequence file.

    For a directory, the result of get_amplicon_analysis_output is
    returned. For a valid file, the same path is returned when it passes
    is_fasta or is_fastq.

    Raises:
        IOError: if the input is not a directory or valid file, or is a
            valid file that is neither Fasta nor Fastq.
    """
    if os.path.isdir(input):
        log.info("Input appears to be a directory, looking for sequence files")
        return get_amplicon_analysis_output(input)

    # Reject anything that is not a usable file before sniffing its format
    if not valid_file(input):
        msg = "Supplied input does not appear to be a valid AmpliconAnalysis file or directory"
        log.error(msg)
        raise IOError(msg)

    # Fasta is checked first, then Fastq
    if is_fasta(input):
        log.info("Input appears to be a valid Fasta file")
        return input
    if is_fastq(input):
        log.info("Input appears to be a valid Fastq file")
        return input

    msg = "Input is not a valid Fasta or Fastq file!"
    log.error(msg)
    raise IOError(msg)
def align_best_reference(query, reference, output=None):
    """
    Run Blasr to align the query against the reference, keeping the best hit

    The output path is resolved through _get_output_file with an 'm1'
    type. Blasr is run with bestn=1 and with nCandidates set to the
    number of sequences fasta_size reports for the reference; the
    reference's '.sa' index is passed along when reference_has_index
    finds one.

    Returns the output path if it is a valid file afterwards, else None.
    """
    output = _get_output_file(query, output, 'm1')

    # Assemble the Blasr options
    num_refs = fasta_size(reference)
    log.info("Aligning %s sequences to %s references" % (query, num_refs))
    options = {
        'nproc': nproc,
        'out': output,
        'bestn': 1,
        'nCandidates': num_refs,
        'noSplitSubreads': True,
    }
    if reference_has_index(reference):
        options['sa'] = reference + '.sa'

    run_blasr(query, reference, options)

    # Only report the output when Blasr actually produced something usable
    return output if valid_file(output) else None
def orient_sequences(input_file, reference_file=None, alignment_file=None, output_file=None):
    """
    Write a copy of the input sequences oriented in the direction of their reference

    When output_file is not supplied it is derived from input_file with
    _get_output_file. An already-valid output file short-circuits the
    whole step. Otherwise an alignment is obtained via get_alignment_file,
    the sequences it marks as reversed are handed to _reverse_records,
    and the resulting records are written to the output file.

    Returns the path of the output file.
    """
    log.info("Reorienting all sequences in %s to the direction of their reference" % input_file)

    # Work out where the result goes and in which format
    if not output_file:
        output_file = _get_output_file(input_file)
    output_type = _get_output_type(output_file)

    # Nothing to do when the result is already on disk
    if valid_file(output_file):
        log.info("Found existing output file %s, skipping orientation step" % output_file)
        return output_file

    # Align if necessary, then collect the sequences to reverse
    alignment_file = get_alignment_file(input_file, reference_file, alignment_file)
    flagged = _identify_reversed_sequences(alignment_file)
    log.info("Identified %s sequences needing Reverse Complementation" % len(flagged))

    source_records = _parse_input_records(input_file)
    result_records = _reverse_records(source_records, flagged)

    log.info("Writing out sequences to %s" % output_file)
    _write_output(result_records, output_file, output_type)
    return output_file
def reference_has_index(reference):
    """
    Look for the '.sa' index file that sits next to a reference.

    Returns the index path (reference + '.sa') when valid_file accepts
    it, otherwise False.
    """
    candidate = reference + '.sa'
    return candidate if valid_file(candidate) else False