def check_command_line_args(options, args, parser):
    """Validate the positional arguments and options of a run.

    Every failed check is reported through ``parser.error``; the final
    processor-count check only logs a warning.

    :param options: parsed options; ``segment_length``,
        ``bowtie_build_bin``, ``bowtie_bin``, ``index_dir`` and
        ``num_processors`` are read
    :param args: positional arguments: the two mate fastq files followed
        by the output directory
    :param parser: parser object whose ``error`` method reports failures
        (assumed to terminate the program, as optparse/argparse do)
    """
    # check command line arguments
    if len(args) < 3:
        parser.error("Incorrect number of command line arguments")
    fastq_files = args[0:2]
    output_dir = args[2]
    # check that input fastq files exist; each mate is tested on its own
    # path (not only args[0]) and before any file is opened for reading
    for mate, fastq_file in enumerate(fastq_files):
        if not os.path.isfile(fastq_file):
            parser.error("mate '%d' fastq file '%s' is not valid" %
                         (mate, fastq_file))
    read_lengths = []
    for fastq_file in fastq_files:
        logging.debug("Checking read length for file %s" % (fastq_file))
        read_lengths.append(get_read_length(fastq_file))
        logging.debug("Read length for file %s: %d" %
                      (fastq_file, read_lengths[-1]))
    # check that mate read lengths are equal
    if len(set(read_lengths)) > 1:
        parser.error("read lengths mate1=%d and mate2=%d are unequal" %
                     (read_lengths[0], read_lengths[1]))
    # the seed (segment) length may not exceed any read length
    if any(options.segment_length > rlen for rlen in read_lengths):
        parser.error("seed length %d cannot be longer than read length" %
                     (options.segment_length))
    # check that output dir is not a regular file
    if os.path.exists(output_dir) and (not os.path.isdir(output_dir)):
        parser.error("Output directory name '%s' exists and is not a valid "
                     "directory" % (output_dir))
    # check that bowtie-build program exists
    if check_executable(options.bowtie_build_bin):
        logging.debug("Checking for 'bowtie-build' binary... found")
    else:
        parser.error("bowtie-build binary not found or not executable")
    # check that bowtie program exists
    if check_executable(options.bowtie_bin):
        logging.debug("Checking for 'bowtie' binary... found")
    else:
        parser.error("bowtie binary not found or not executable")
    # check that alignment index exists
    if os.path.isdir(options.index_dir):
        logging.debug("Checking for chimerascan index directory... found")
    else:
        parser.error("chimerascan alignment index directory '%s' not valid" %
                     (options.index_dir))
    # check that alignment index file exists
    align_index_file = os.path.join(options.index_dir,
                                    config.BOWTIE_INDEX_FILE)
    if os.path.isfile(align_index_file):
        logging.debug("Checking for bowtie index file... found")
    else:
        parser.error("chimerascan bowtie index file '%s' invalid" %
                     (align_index_file))
    # too few processors is only worth a warning, not a failure
    if options.num_processors < config.BASE_PROCESSORS:
        logging.warning("Please specify >=2 processes using '-p' to allow "
                        "program to run efficiently")
def check_config(self):
    """Check the run configuration, logging every problem that is found.

    Reads ``fastq_files``, ``segment_length``, ``output_dir``,
    ``bowtie_build_bin``, ``bowtie_bin``, ``index_dir`` and
    ``num_processors`` from ``self``. All checks are run even after a
    failure so that every problem is logged in one pass.

    :returns: True when every check passed, False otherwise
    """
    config_passed = True
    read_lengths = []
    # check that input fastq files exist
    for mate, fastq_file in enumerate(self.fastq_files):
        if not os.path.isfile(fastq_file):
            logging.error("mate '%d' fastq file '%s' is not valid" %
                          (mate, fastq_file))
            config_passed = False
            # a missing file has no read length to measure; skip it so
            # the remaining checks still run and False is returned
            continue
        logging.debug("Checking file %s" % (fastq_file))
        read_lengths.append(get_read_length(fastq_file))
        logging.debug("File %s read length=%d" %
                      (fastq_file, read_lengths[-1]))
    # check that mate read lengths are equal
    if len(set(read_lengths)) > 1:
        logging.error("Unequal read lengths mate1=%d and mate2=%d" %
                      (read_lengths[0], read_lengths[1]))
        config_passed = False
    # the seed (segment) length may not exceed any read length
    if any(self.segment_length > rlen for rlen in read_lengths):
        logging.error("seed length %d cannot be longer than read length" %
                      (self.segment_length))
        config_passed = False
    # check that output dir is not a regular file
    if os.path.exists(self.output_dir) and (not os.path.isdir(self.output_dir)):
        logging.error("Output directory name '%s' exists and is not a valid "
                      "directory" % (self.output_dir))
        config_passed = False
    # check that bowtie-build program exists
    if check_executable(self.bowtie_build_bin):
        logging.debug("Checking for 'bowtie-build' binary... found")
    else:
        logging.error("bowtie-build binary not found or not executable")
        config_passed = False
    # check that bowtie program exists
    if check_executable(self.bowtie_bin):
        logging.debug("Checking for 'bowtie' binary... found")
    else:
        logging.error("bowtie binary not found or not executable")
        config_passed = False
    # check that alignment index exists
    if os.path.isdir(self.index_dir):
        logging.debug("Checking for chimerascan index directory... found")
        # the index file is only looked for inside a valid directory
        align_index_file = os.path.join(self.index_dir,
                                        config.BOWTIE_INDEX_FILE)
        if os.path.isfile(align_index_file):
            logging.debug("Checking for bowtie index file... found")
        else:
            logging.error("chimerascan bowtie index file '%s' invalid" %
                          (align_index_file))
            config_passed = False
    else:
        logging.error("chimerascan alignment index directory '%s' not valid" %
                      (self.index_dir))
        config_passed = False
    # too few processors is only worth a warning, not a failure
    if self.num_processors < config.BASE_PROCESSORS:
        logging.warning("Please specify >=2 processes using '-p' to allow "
                        "program to run efficiently")
    return config_passed
def check_command_line_args(options, args, parser):
    """Validate the positional arguments and options of a run.

    Every failed check is reported through ``parser.error``; the final
    processor-count check only logs a warning.

    :param options: parsed options; ``segment_length``,
        ``bowtie_build_bin``, ``bowtie_bin``, ``index_dir`` and
        ``num_processors`` are read
    :param args: positional arguments: the two mate fastq files followed
        by the output directory
    :param parser: parser object whose ``error`` method reports failures
        (assumed to terminate the program, as optparse/argparse do)
    """
    # check command line arguments
    if len(args) < 3:
        parser.error("Incorrect number of command line arguments")
    fastq_files = args[0:2]
    output_dir = args[2]
    # check that input fastq files exist; each mate is tested on its own
    # path (not only args[0]) and before any file is opened for reading
    for mate, fastq_file in enumerate(fastq_files):
        if not os.path.isfile(fastq_file):
            parser.error("mate '%d' fastq file '%s' is not valid" %
                         (mate, fastq_file))
    read_lengths = []
    for fastq_file in fastq_files:
        logging.debug("Checking read length for file %s" % (fastq_file))
        read_lengths.append(get_read_length(fastq_file))
        logging.debug("Read length for file %s: %d" %
                      (fastq_file, read_lengths[-1]))
    # check that mate read lengths are equal
    if len(set(read_lengths)) > 1:
        parser.error("read lengths mate1=%d and mate2=%d are unequal" %
                     (read_lengths[0], read_lengths[1]))
    # the seed (segment) length may not exceed any read length
    if any(options.segment_length > rlen for rlen in read_lengths):
        parser.error("seed length %d cannot be longer than read length" %
                     (options.segment_length))
    # check that output dir is not a regular file
    if os.path.exists(output_dir) and (not os.path.isdir(output_dir)):
        parser.error("Output directory name '%s' exists and is not a valid "
                     "directory" % (output_dir))
    # check that bowtie-build program exists
    if check_executable(options.bowtie_build_bin):
        logging.debug("Checking for 'bowtie-build' binary... found")
    else:
        parser.error("bowtie-build binary not found or not executable")
    # check that bowtie program exists
    if check_executable(options.bowtie_bin):
        logging.debug("Checking for 'bowtie' binary... found")
    else:
        parser.error("bowtie binary not found or not executable")
    # check that alignment index exists
    if os.path.isdir(options.index_dir):
        logging.debug("Checking for chimerascan index directory... found")
    else:
        parser.error("chimerascan alignment index directory '%s' not valid" %
                     (options.index_dir))
    # check that alignment index file exists
    align_index_file = os.path.join(options.index_dir,
                                    config.BOWTIE_INDEX_FILE)
    if os.path.isfile(align_index_file):
        logging.debug("Checking for bowtie index file... found")
    else:
        parser.error("chimerascan bowtie index file '%s' invalid" %
                     (align_index_file))
    # too few processors is only worth a warning, not a failure
    if options.num_processors < config.BASE_PROCESSORS:
        logging.warning("Please specify >=2 processes using '-p' to allow "
                        "program to run efficiently")
def main():
    """Command-line entry point for building chimerascan indexes.

    Parses the three positional arguments, validates them (reporting
    problems through ``parser.error``) and runs the index builder.

    :returns: the value returned by ``create_chimerascan_index``
    """
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.DEBUG, format=log_format)
    parser = argparse.ArgumentParser(
        description="Build alignment indexes for use with chimerascan")
    parser.add_argument("ref_fasta_file",
                        help="reference genome FASTA file")
    parser.add_argument("transcript_feature_file",
                        help="transcript features")
    parser.add_argument("output_dir",
                        help="directory where indexes will be created")
    args = parser.parse_args()
    ref_fasta = args.ref_fasta_file
    features = args.transcript_feature_file
    out_dir = args.output_dir
    # both inputs have to be existing regular files
    if not os.path.isfile(ref_fasta):
        parser.error("Reference fasta file '%s' not found" % (ref_fasta))
    if not os.path.isfile(features):
        parser.error("Gene feature file '%s' not found" % (features))
    # the output path may be absent or a directory, but never a plain file
    if os.path.exists(out_dir) and not os.path.isdir(out_dir):
        parser.error("Output directory name '%s' exists and is not a valid "
                     "directory" % (out_dir))
    # the bowtie2-build binary has to be runnable
    build_bin = config.BOWTIE2_BUILD_BIN
    if not check_executable(build_bin):
        parser.error("%s binary not found or not executable" % (build_bin))
    else:
        logging.debug("Checking for '%s' binary... found" % (build_bin))
    # hand over to the index creation routine
    return create_chimerascan_index(out_dir, ref_fasta, features)
def main():
    """Command-line entry point for building chimerascan indexes.

    Parses the three positional arguments, validates them (reporting
    problems through ``parser.error``) and runs the index builder.

    :returns: the value returned by ``create_chimerascan_index``
    """
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.DEBUG, format=log_format)
    parser = argparse.ArgumentParser(
        description="Build alignment indexes for use with chimerascan")
    parser.add_argument("ref_fasta_file",
                        help="reference genome FASTA file")
    parser.add_argument("transcript_feature_file",
                        help="transcript features")
    parser.add_argument("output_dir",
                        help="directory where indexes will be created")
    args = parser.parse_args()
    ref_fasta = args.ref_fasta_file
    features = args.transcript_feature_file
    out_dir = args.output_dir
    # both inputs have to be existing regular files
    if not os.path.isfile(ref_fasta):
        parser.error("Reference fasta file '%s' not found" % (ref_fasta))
    if not os.path.isfile(features):
        parser.error("Gene feature file '%s' not found" % (features))
    # the output path may be absent or a directory, but never a plain file
    if os.path.exists(out_dir) and not os.path.isdir(out_dir):
        parser.error("Output directory name '%s' exists and is not a valid "
                     "directory" % (out_dir))
    # the bowtie2-build binary has to be runnable
    build_bin = config.BOWTIE2_BUILD_BIN
    if not check_executable(build_bin):
        parser.error("%s binary not found or not executable" % (build_bin))
    else:
        logging.debug("Checking for '%s' binary... found" % (build_bin))
    # hand over to the index creation routine
    return create_chimerascan_index(out_dir, ref_fasta, features)
def main():
    """Command-line entry point for building chimerascan indexes.

    Parses ``--bowtie-dir`` plus three positional arguments, validates
    them (reporting problems through ``parser.error``), runs the index
    builder and exits with its return code via ``sys.exit``.
    """
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.DEBUG, format=log_format)
    usage = ("usage: %prog [options] <reference_genome.fa> "
             "<genepred_genes.txt> <index_output_dir>")
    parser = OptionParser(usage)
    bowtie_dir_help = ("Path to the 'bowtie' software (by default, "
                       "expects the 'bowtie' and 'bowtie-build' "
                       "binaries to be in current PATH)")
    parser.add_option("--bowtie-dir", dest="bowtie_dir", default="",
                      help=bowtie_dir_help)
    options, args = parser.parse_args()
    # three positional arguments are required
    if len(args) < 3:
        parser.error("Incorrect number of command line arguments")
    ref_fasta_file, gene_feature_file, output_dir = args[0], args[1], args[2]
    # both inputs have to be existing regular files
    if not os.path.isfile(ref_fasta_file):
        parser.error("Reference fasta file '%s' not found" %
                     (ref_fasta_file))
    if not os.path.isfile(gene_feature_file):
        parser.error("Gene feature file '%s' not found" %
                     (gene_feature_file))
    # the output path may be absent or a directory, but never a plain file
    if os.path.exists(output_dir) and not os.path.isdir(output_dir):
        parser.error("Output directory name '%s' exists and is not a valid "
                     "directory" % (output_dir))
    # locate bowtie-build relative to the (possibly empty) bowtie dir
    bowtie_build_bin = os.path.join(options.bowtie_dir, "bowtie-build")
    if not check_executable(bowtie_build_bin):
        parser.error("bowtie-build binary not found or not executable")
    else:
        logging.debug("Checking for 'bowtie-build' binary... found")
    # hand over to the index creation routine and exit with its result
    sys.exit(create_chimerascan_index(output_dir, ref_fasta_file,
                                      gene_feature_file, bowtie_build_bin))
def _setup_and_open_files(genome_index, transcripts, input_file, output_file,
                          library_type, input_sam, output_sam):
    """Open the input/output alignment files and build the transcript map.

    The output file gets the input header with its ``SQ`` entries replaced
    by the references read from the bowtie2 genome index.

    :param genome_index: passed to ``get_references_from_bowtie2_index``
    :param transcripts: iterable of objects providing ``exons``,
        ``strand``, ``tx_id`` and ``chrom``
    :param input_file: alignment file to read
    :param output_file: alignment file to create
    :param library_type: not used by this function
    :param input_sam: truthy to read the input as SAM text (mode "r")
        instead of BAM (mode "rb")
    :param output_sam: truthy to write SAM with header (mode "wh")
        instead of BAM (mode "wb")
    :returns: ``(infh, outfh, transcript_tid_map)`` on success, where the
        map takes an input reference index to ``(genome reference index,
        negstrand, exons)``; NOTE: when the bowtie2-inspect binary is
        missing the single value ``config.JOB_ERROR`` is returned instead
        of a tuple
    """
    # create SAM header from genome index
    logging.debug("Creating genome SAM header")
    if not check_executable(config.BOWTIE2_INSPECT_BIN):
        logging.error("Cannot find bowtie2-inspect binary")
        return config.JOB_ERROR
    # get references/lengths from bowtie2
    ref_list = get_references_from_bowtie2_index(genome_index)
    # open input file and derive the output header from it
    infh = pysam.Samfile(input_file, "r" if input_sam else "rb")
    outfh = None
    try:
        header_dict = dict(infh.header)
        header_dict['SQ'] = [{'SN': seqname, 'LN': seqlen}
                             for seqname, seqlen in ref_list]
        # open output file with new header
        outfh = pysam.Samfile(output_file, "wh" if output_sam else "wb",
                              header=header_dict)
        # setup reference name mappings
        genome_rname_tid_map = dict(
            (rname, i) for i, rname in enumerate(outfh.references))
        transcriptome_rname_tid_map = dict(
            (rname, i) for i, rname in enumerate(infh.references))
        # read transcript features and prepare data structure for conversion
        logging.debug("Creating transcript to genome map")
        transcript_tid_map = {}
        for t in transcripts:
            exons = [(start, end) for start, end in t.exons]
            negstrand = (t.strand == "-")
            if negstrand:
                # negative-strand transcripts store their exons reversed
                exons.reverse()
            transcript_tid = transcriptome_rname_tid_map[str(t.tx_id)]
            genome_tid = genome_rname_tid_map[t.chrom]
            transcript_tid_map[transcript_tid] = (genome_tid, negstrand,
                                                  exons)
    except Exception:
        # the caller never receives the handles on failure, so release
        # them here before propagating the error
        if outfh is not None:
            outfh.close()
        infh.close()
        raise
    return infh, outfh, transcript_tid_map
def main():
    """Command-line entry point for building chimerascan indexes.

    Parses ``--bowtie-dir`` plus three positional arguments, validates
    them (reporting problems through ``parser.error``), runs the index
    builder and exits with its return code via ``sys.exit``.
    """
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.DEBUG, format=log_format)
    usage = ("usage: %prog [options] <reference_genome.fa> "
             "<gene_models.txt> <index_output_dir>")
    parser = OptionParser(usage)
    # NOTE: the -i/--min-fragment-size and -I/--max-fragment-size options
    # are currently disabled and not forwarded to the index builder
    bowtie_dir_help = ("Path to the 'bowtie' software (by default, "
                       "expects the 'bowtie' and 'bowtie-build' "
                       "binaries to be in current PATH)")
    parser.add_option("--bowtie-dir", dest="bowtie_dir", default="",
                      help=bowtie_dir_help)
    options, args = parser.parse_args()
    # three positional arguments are required
    if len(args) < 3:
        parser.error("Incorrect number of command line arguments")
    ref_fasta_file, gene_feature_file, output_dir = args[0], args[1], args[2]
    # both inputs have to be existing regular files
    if not os.path.isfile(ref_fasta_file):
        parser.error("Reference fasta file '%s' not found" %
                     (ref_fasta_file))
    if not os.path.isfile(gene_feature_file):
        parser.error("Gene feature file '%s' not found" %
                     (gene_feature_file))
    # the output path may be absent or a directory, but never a plain file
    if os.path.exists(output_dir) and not os.path.isdir(output_dir):
        parser.error("Output directory name '%s' exists and is not a valid "
                     "directory" % (output_dir))
    # locate bowtie-build relative to the (possibly empty) bowtie dir
    bowtie_build_bin = os.path.join(options.bowtie_dir, "bowtie-build")
    if not check_executable(bowtie_build_bin):
        parser.error("bowtie-build binary not found or not executable")
    else:
        logging.debug("Checking for 'bowtie-build' binary... found")
    # hand over to the index creation routine and exit with its result
    sys.exit(create_chimerascan_index(output_dir, ref_fasta_file,
                                      gene_feature_file, bowtie_build_bin))
def _setup_and_open_files(genome_index, transcripts, input_file, output_file,
                          library_type, input_sam, output_sam):
    """Open the input/output alignment files and build the transcript map.

    The output file gets the input header with its ``SQ`` entries replaced
    by the references read from the bowtie2 genome index.

    :param genome_index: passed to ``get_references_from_bowtie2_index``
    :param transcripts: iterable of objects providing ``exons``,
        ``strand``, ``tx_id`` and ``chrom``
    :param input_file: alignment file to read
    :param output_file: alignment file to create
    :param library_type: not used by this function
    :param input_sam: truthy to read the input as SAM text (mode "r")
        instead of BAM (mode "rb")
    :param output_sam: truthy to write SAM with header (mode "wh")
        instead of BAM (mode "wb")
    :returns: ``(infh, outfh, transcript_tid_map)`` on success, where the
        map takes an input reference index to ``(genome reference index,
        negstrand, exons)``; NOTE: when the bowtie2-inspect binary is
        missing the single value ``config.JOB_ERROR`` is returned instead
        of a tuple
    """
    # create SAM header from genome index
    logging.debug("Creating genome SAM header")
    if not check_executable(config.BOWTIE2_INSPECT_BIN):
        logging.error("Cannot find bowtie2-inspect binary")
        return config.JOB_ERROR
    # get references/lengths from bowtie2
    ref_list = get_references_from_bowtie2_index(genome_index)
    # open input file and derive the output header from it
    infh = pysam.Samfile(input_file, "r" if input_sam else "rb")
    outfh = None
    try:
        header_dict = dict(infh.header)
        header_dict['SQ'] = [{'SN': seqname, 'LN': seqlen}
                             for seqname, seqlen in ref_list]
        # open output file with new header
        outfh = pysam.Samfile(output_file, "wh" if output_sam else "wb",
                              header=header_dict)
        # setup reference name mappings
        genome_rname_tid_map = dict(
            (rname, i) for i, rname in enumerate(outfh.references))
        transcriptome_rname_tid_map = dict(
            (rname, i) for i, rname in enumerate(infh.references))
        # read transcript features and prepare data structure for conversion
        logging.debug("Creating transcript to genome map")
        transcript_tid_map = {}
        for t in transcripts:
            exons = [(start, end) for start, end in t.exons]
            negstrand = (t.strand == "-")
            if negstrand:
                # negative-strand transcripts store their exons reversed
                exons.reverse()
            transcript_tid = transcriptome_rname_tid_map[str(t.tx_id)]
            genome_tid = genome_rname_tid_map[t.chrom]
            transcript_tid_map[transcript_tid] = (genome_tid, negstrand,
                                                  exons)
    except Exception:
        # the caller never receives the handles on failure, so release
        # them here before propagating the error
        if outfh is not None:
            outfh.close()
        infh.close()
        raise
    return infh, outfh, transcript_tid_map
def main():
    """Command-line entry point for building chimerascan indexes.

    Parses ``--bowtie-build-bin`` plus three positional arguments,
    validates them (reporting problems through ``parser.error``), runs
    the index builder and exits with its return code via ``sys.exit``.
    """
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.DEBUG, format=log_format)
    usage = ("usage: %prog [options] <reference_genome.fa> "
             "<gene_models.txt> <index_output_dir>")
    parser = OptionParser(usage)
    parser.add_option("--bowtie-build-bin", dest="bowtie_build_bin",
                      default="bowtie-build",
                      help="Path to 'bowtie-build' program")
    options, args = parser.parse_args()
    # three positional arguments are required
    if len(args) < 3:
        parser.error("Incorrect number of command line arguments")
    ref_fasta_file, gene_feature_file, output_dir = args[0], args[1], args[2]
    # both inputs have to be existing regular files
    if not os.path.isfile(ref_fasta_file):
        parser.error("Reference fasta file '%s' not found" %
                     (ref_fasta_file))
    if not os.path.isfile(gene_feature_file):
        parser.error("Gene feature file '%s' not found" %
                     (gene_feature_file))
    # the output path may be absent or a directory, but never a plain file
    if os.path.exists(output_dir) and not os.path.isdir(output_dir):
        parser.error("Output directory name '%s' exists and is not a valid "
                     "directory" % (output_dir))
    # the bowtie-build binary has to be runnable
    if not check_executable(options.bowtie_build_bin):
        parser.error("bowtie-build binary not found or not executable")
    else:
        logging.debug("Checking for 'bowtie-build' binary... found")
    # hand over to the index creation routine and exit with its result
    sys.exit(create_chimerascan_index(output_dir, ref_fasta_file,
                                      gene_feature_file,
                                      options.bowtie_build_bin))
def check_config(self):
    """Check the run configuration, logging every problem that is found.

    Reads ``fastq_files``, ``trim5``, ``trim3``, ``segment_length``,
    ``local_anchor_length``, ``output_dir``, ``index_dir`` and
    ``num_processors`` from ``self``. All checks are run even after a
    failure so that every problem is logged in one pass.

    :returns: True when every check passed, False otherwise
    """
    config_passed = True
    # check that input fastq files exist
    existing_files = []
    for mate, fastq_file in enumerate(self.fastq_files):
        if os.path.isfile(fastq_file):
            existing_files.append(fastq_file)
        else:
            logging.error("mate '%d' fastq file '%s' is not valid" %
                          (mate, fastq_file))
            config_passed = False
    # check read lengths with trimming applied; files already reported
    # missing are skipped since they have no read length to detect
    logging.debug("Checking read lengths")
    read_lengths = [detect_read_length(fq) for fq in existing_files]
    total_trimming = self.trim5 + self.trim3
    for fastq_file, rlen in zip(existing_files, read_lengths):
        trimmed_rlen = rlen - total_trimming
        logging.debug("File %s read length: %d after trimming: %d" %
                      (fastq_file, rlen, trimmed_rlen))
        if trimmed_rlen < config.MIN_SEGMENT_LENGTH:
            # one placeholder per argument; the previous message had a
            # single %d for two values and raised TypeError here
            logging.error("Trimmed read length %d is less than the minimum "
                          "length of %d" %
                          (trimmed_rlen, config.MIN_SEGMENT_LENGTH))
            config_passed = False
    # check that mate read lengths are equal
    if len(set(read_lengths)) > 1:
        logging.error("Unequal read lengths mate1=%d and mate2=%d" %
                      (read_lengths[0], read_lengths[1]))
        config_passed = False
    # the seed (segment) length, when given, may not exceed any read length
    if self.segment_length is not None:
        if any((self.segment_length > rlen) for rlen in read_lengths):
            logging.error("seed length %d cannot be longer than read length" %
                          (self.segment_length))
            config_passed = False
    # ensure local anchor length is not below the minimum
    if self.local_anchor_length < config.LOCAL_ANCHOR_LENGTH_MIN:
        logging.error("Local anchor length of %d < %d" %
                      (self.local_anchor_length,
                       config.LOCAL_ANCHOR_LENGTH_MIN))
        config_passed = False
    # check that output dir is not a regular file
    if os.path.exists(self.output_dir) and (not os.path.isdir(self.output_dir)):
        logging.error("Output directory name '%s' exists and is not a valid "
                      "directory" % (self.output_dir))
        config_passed = False
    # check that the bowtie2-build program exists
    if check_executable(config.BOWTIE2_BUILD_BIN):
        logging.debug("Checking for '%s' binary... found" %
                      config.BOWTIE2_BUILD_BIN)
    else:
        logging.error("%s binary not found or not executable" %
                      config.BOWTIE2_BUILD_BIN)
        config_passed = False
    # check that the bowtie2 program exists
    if check_executable(config.BOWTIE2_BIN):
        logging.debug("Checking for '%s' binary... found" %
                      config.BOWTIE2_BIN)
    else:
        logging.error("%s binary not found or not executable" %
                      config.BOWTIE2_BIN)
        config_passed = False
    # check that alignment index exists
    if os.path.isdir(self.index_dir):
        logging.debug("Checking for chimerascan index directory... found")
        # check that alignment index files exist; only the first missing
        # file of each group is reported
        for f in config.TRANSCRIPTOME_BOWTIE2_FILES:
            filename = os.path.join(self.index_dir, f)
            if not os.path.isfile(filename):
                logging.error("chimerascan index file '%s' invalid" %
                              (filename))
                config_passed = False
                break
        for f in config.GENOME_BOWTIE2_FILES:
            filename = os.path.join(self.index_dir, f)
            if not os.path.isfile(filename):
                logging.error("chimerascan index file '%s' invalid" %
                              (filename))
                config_passed = False
                break
    else:
        logging.error("chimerascan alignment index directory '%s' not valid" %
                      (self.index_dir))
        config_passed = False
    # too few processors is only worth a warning, not a failure
    if self.num_processors < config.BASE_PROCESSORS:
        logging.warning("Please specify >=2 processes using '-p' to allow "
                        "program to run efficiently")
    return config_passed
def check_config(self):
    """Check the run configuration, logging every problem that is found.

    Reads ``fastq_files``, ``trim5``, ``trim3``, ``segment_length``,
    ``local_anchor_length``, ``output_dir``, ``index_dir`` and
    ``num_processors`` from ``self``. All checks are run even after a
    failure so that every problem is logged in one pass.

    :returns: True when every check passed, False otherwise
    """
    config_passed = True
    # check that input fastq files exist
    existing_files = []
    for mate, fastq_file in enumerate(self.fastq_files):
        if os.path.isfile(fastq_file):
            existing_files.append(fastq_file)
        else:
            logging.error("mate '%d' fastq file '%s' is not valid" %
                          (mate, fastq_file))
            config_passed = False
    # check read lengths with trimming applied; files already reported
    # missing are skipped since they have no read length to detect
    logging.debug("Checking read lengths")
    read_lengths = [detect_read_length(fq) for fq in existing_files]
    total_trimming = self.trim5 + self.trim3
    for fastq_file, rlen in zip(existing_files, read_lengths):
        trimmed_rlen = rlen - total_trimming
        logging.debug("File %s read length: %d after trimming: %d" %
                      (fastq_file, rlen, trimmed_rlen))
        if trimmed_rlen < config.MIN_SEGMENT_LENGTH:
            # one placeholder per argument; the previous message had a
            # single %d for two values and raised TypeError here
            logging.error("Trimmed read length %d is less than the minimum "
                          "length of %d" %
                          (trimmed_rlen, config.MIN_SEGMENT_LENGTH))
            config_passed = False
    # check that mate read lengths are equal
    if len(set(read_lengths)) > 1:
        logging.error("Unequal read lengths mate1=%d and mate2=%d" %
                      (read_lengths[0], read_lengths[1]))
        config_passed = False
    # the seed (segment) length, when given, may not exceed any read length
    if self.segment_length is not None:
        if any((self.segment_length > rlen) for rlen in read_lengths):
            logging.error("seed length %d cannot be longer than read length" %
                          (self.segment_length))
            config_passed = False
    # ensure local anchor length is not below the minimum
    if self.local_anchor_length < config.LOCAL_ANCHOR_LENGTH_MIN:
        logging.error("Local anchor length of %d < %d" %
                      (self.local_anchor_length,
                       config.LOCAL_ANCHOR_LENGTH_MIN))
        config_passed = False
    # check that output dir is not a regular file
    if os.path.exists(self.output_dir) and (not os.path.isdir(self.output_dir)):
        logging.error("Output directory name '%s' exists and is not a valid "
                      "directory" % (self.output_dir))
        config_passed = False
    # check that the bowtie2-build program exists
    if check_executable(config.BOWTIE2_BUILD_BIN):
        logging.debug("Checking for '%s' binary... found" %
                      config.BOWTIE2_BUILD_BIN)
    else:
        logging.error("%s binary not found or not executable" %
                      config.BOWTIE2_BUILD_BIN)
        config_passed = False
    # check that the bowtie2 program exists
    if check_executable(config.BOWTIE2_BIN):
        logging.debug("Checking for '%s' binary... found" %
                      config.BOWTIE2_BIN)
    else:
        logging.error("%s binary not found or not executable" %
                      config.BOWTIE2_BIN)
        config_passed = False
    # check that alignment index exists
    if os.path.isdir(self.index_dir):
        logging.debug("Checking for chimerascan index directory... found")
        # check that alignment index files exist; only the first missing
        # file of each group is reported
        for f in config.TRANSCRIPTOME_BOWTIE2_FILES:
            filename = os.path.join(self.index_dir, f)
            if not os.path.isfile(filename):
                logging.error("chimerascan index file '%s' invalid" %
                              (filename))
                config_passed = False
                break
        for f in config.GENOME_BOWTIE2_FILES:
            filename = os.path.join(self.index_dir, f)
            if not os.path.isfile(filename):
                logging.error("chimerascan index file '%s' invalid" %
                              (filename))
                config_passed = False
                break
    else:
        logging.error("chimerascan alignment index directory '%s' not valid" %
                      (self.index_dir))
        config_passed = False
    # too few processors is only worth a warning, not a failure
    if self.num_processors < config.BASE_PROCESSORS:
        logging.warning("Please specify >=2 processes using '-p' to allow "
                        "program to run efficiently")
    return config_passed
def check_config(self):
    """Check the run configuration, logging every problem that is found.

    Reads ``fastq_files``, ``segment_length``, ``output_dir``,
    ``bowtie_build_bin``, ``bowtie_bin``, ``index_dir`` and
    ``num_processors`` from ``self``. All checks are run even after a
    failure so that every problem is logged in one pass.

    :returns: True when every check passed, False otherwise
    """
    config_passed = True
    read_lengths = []
    # check that input fastq files exist
    for mate, fastq_file in enumerate(self.fastq_files):
        if not os.path.isfile(fastq_file):
            logging.error("mate '%d' fastq file '%s' is not valid" %
                          (mate, fastq_file))
            config_passed = False
            # a missing file has no read length to measure; skip it so
            # the remaining checks still run and False is returned
            continue
        logging.debug("Checking file %s" % (fastq_file))
        read_lengths.append(get_read_length(fastq_file))
        logging.debug("File %s read length=%d" %
                      (fastq_file, read_lengths[-1]))
    # check that mate read lengths are equal
    if len(set(read_lengths)) > 1:
        logging.error("Unequal read lengths mate1=%d and mate2=%d" %
                      (read_lengths[0], read_lengths[1]))
        config_passed = False
    # the seed (segment) length may not exceed any read length
    if any(self.segment_length > rlen for rlen in read_lengths):
        logging.error("seed length %d cannot be longer than read length" %
                      (self.segment_length))
        config_passed = False
    # check that output dir is not a regular file
    if os.path.exists(self.output_dir) and (not os.path.isdir(self.output_dir)):
        logging.error("Output directory name '%s' exists and is not a valid "
                      "directory" % (self.output_dir))
        config_passed = False
    # check that bowtie-build program exists
    if check_executable(self.bowtie_build_bin):
        logging.debug("Checking for 'bowtie-build' binary... found")
    else:
        logging.error("bowtie-build binary not found or not executable")
        config_passed = False
    # check that bowtie program exists
    if check_executable(self.bowtie_bin):
        logging.debug("Checking for 'bowtie' binary... found")
    else:
        logging.error("bowtie binary not found or not executable")
        config_passed = False
    # check that alignment index exists
    if os.path.isdir(self.index_dir):
        logging.debug("Checking for chimerascan index directory... found")
        # the index file is only looked for inside a valid directory
        align_index_file = os.path.join(self.index_dir,
                                        config.BOWTIE_INDEX_FILE)
        if os.path.isfile(align_index_file):
            logging.debug("Checking for bowtie index file... found")
        else:
            logging.error("chimerascan bowtie index file '%s' invalid" %
                          (align_index_file))
            config_passed = False
    else:
        logging.error("chimerascan alignment index directory '%s' not valid" %
                      (self.index_dir))
        config_passed = False
    # too few processors is only worth a warning, not a failure
    if self.num_processors < config.BASE_PROCESSORS:
        logging.warning("Please specify >=2 processes using '-p' to allow "
                        "program to run efficiently")
    return config_passed