# (fragment) Closes a parser.add_argument(...) call begun above this view.
help='Basename for index to be written')
parser.add_argument(\
    '--keep-alive', action='store_const', const=True, default=False,
    help='Prints reporter:status:alive messages to stderr to keep EMR '
         'task alive')
# Let helper modules register their own command-line argument groups
filemover.add_args(parser)
bowtie.add_args(parser)
tempdel.add_args(parser)
args = parser.parse_args()

import time
# Record wall-clock start so elapsed time can be reported later
start_time = time.time()

output_filename, output_stream, output_url = [None] * 3
# Default the output URL to the current working directory when --out is absent
output_url = Url(args.out) if args.out is not None \
    else Url(os.getcwd())
# Set up temporary destination
import tempfile
# make_temp_dir honors the user-specified scratch dir (env vars expanded)
temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
# For deleting temporary directory, even on unexpected exit
register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])
# Set up temporary destination
# NOTE(review): bare except silently swallows all errors, not just
# "directory already exists" — consider catching OSError explicitly
try:
    os.makedirs(os.path.join(temp_dir_path, 'index'))
except:
    pass
# Write to temporary directory, and later upload to URL
index_basename = os.path.join(temp_dir_path, 'index/' + args.basename)
fasta_file = os.path.join(temp_dir_path, 'temp.fa')
# NOTE(review): message printed before any open() occurs; the FASTA file is
# presumably opened further below — confirm ordering
print >> sys.stderr, 'Opened %s for writing....' % fasta_file
# (fragment) Tail of a docstring begun above this view.
formed; reference_index.rname_lengths[RNAME] is the length of RNAME.'''
# Load Bowtie index metadata (reference names and lengths)
reference_index = bowtie_index.BowtieIndexReference(
        os.path.expandvars(args.bowtie_idx))
# For mapping sample indices back to original sample labels
manifest_object = manifest.LabelsAndIndices(
        os.path.expandvars(args.manifest))
# Create file with chromosome sizes for bedTobigwig
sizes_filename = os.path.join(temp_dir_path, 'chrom.sizes')
if args.verbose:
    print >> sys.stderr, 'Sizes file: %s .' % sizes_filename
# One "<rname> <length>" line per reference sequence
with open(sizes_filename, 'w') as sizes_stream:
    for rname in reference_index.rname_lengths:
        print >> sizes_stream, '%s %d' % (
                rname, reference_index.rname_lengths[rname])

# Counters reported to the framework when the task finishes
input_line_count, output_line_count = 0, 0
output_url = Url(args.out)
if output_url.is_local:
    # Set up destination directory
    # NOTE(review): bare except also hides permission errors; consider
    # catching OSError explicitly
    try:
        os.makedirs(output_url.to_url())
    except:
        pass
mover = filemover.FileMover(args=args)
# UCSC track header template for the emitted bedGraph files
track_line = ('track type=bedGraph name="{name}" '
              'description="{description}" visibility=full '
              'color=227,29,118 altColor=0,179,220 priority=400')
# Input is partitioned on a single key field: the sample index
for (sample_index, ), xpartition in xstream(sys.stdin, 1):
    try:
        sample_label = manifest_object.index_to_label[sample_index]
    except KeyError:
        # It's a mean or median
def go(input_stream=sys.stdin, output_stream=sys.stdout, bowtie2_exe='bowtie2',
        bowtie2_index_base='genome', bowtie2_args='', verbose=False,
        report_multiplier=1.2, stranded=False, fudge=5, score_min=60,
        gzip_level=3, mover=filemover.FileMover(), intermediate_dir='.',
        scratch=None):
    """ Runs Rail-RNA-cointron_enum

        Alignment script for MapReduce pipelines that wraps Bowtie 2. Finds
        introns that cooccur on reads by local alignments to transcriptome
        elements from Bowtie 2.

        Input (read from stdin)
        ----------------------------
        Tab-delimited output tuple columns (readletize)
        1. SEQ or its reversed complement, whichever is first in alphabetical
            order
        2. Comma-separated list of sample labels if field 1 is the read
            sequence; '\x1c' if empty
        3. Comma-separated list of sample labels if field 1 is the reversed
            complement of the read sequence; '\x1c' if empty

        Hadoop output (written to stdout)
        ----------------------------
        Tab-delimited tuple columns:
        1. Reference name (RNAME in SAM format) + '+' or '-' indicating which
            strand is the sense strand
        2. Comma-separated list of intron start positions in configuration
        3. Comma-separated list of intron end positions in configuration
        4. left_extend_size: by how many bases on the left side of an intron
            the reference should extend
        5. right_extend_size: by how many bases on the right side of an intron
            the reference should extend
        6. Read sequence

        input_stream: where to find input reads.
        output_stream: where to emit exonic chunks and introns.
        bowtie2_exe: filename of Bowtie 2 executable; include path if not in
            $PATH.
        bowtie2_index_base: the basename of the Bowtie index files associated
            with the reference.
        bowtie2_args: string containing precisely extra command-line arguments
            to pass to Bowtie 2, e.g., "--tryhard --best"; or None.
        verbose: True iff more informative messages should be written to
            stderr.
        report_multiplier: if verbose is True, the line number of an
            alignment written to stderr increases exponentially with base
            report_multiplier.
        stranded: True iff input reads are strand-specific; this affects
            whether an output partition has a terminal '+' or '-' indicating
            the sense strand. Further, if stranded is True, an alignment is
            returned only if its strand agrees with the intron's strand.
        fudge: by how many bases to extend left and right extend sizes to
            accommodate potential indels
        score_min: Bowtie2 CONSTANT minimum alignment score
        gzip_level: compression level to use for temporary files
        mover: FileMover object, for use in case Bowtie2 idx needs to be
            pulled from S3
        intermediate_dir: where intermediates are stored; for temporarily
            storing transcript index if it needs to be pulled from S3
        scratch: scratch directory for storing temporary files or None if
            securely created temporary directory

        No return value.
    """
    bowtie2_index_base_url = Url(bowtie2_index_base)
    if bowtie2_index_base_url.is_s3:
        index_basename = os.path.basename(bowtie2_index_base)
        index_directory = os.path.join(intermediate_dir, 'transcript_index')
        # Ensure download directory exists before writing flag files there;
        # ignore "already exists" (another task may have created it)
        try:
            os.makedirs(index_directory)
        except OSError:
            pass
        if not os.path.exists(os.path.join(index_directory, '_STARTED')):
            # Download index; _STARTED flag tells sibling tasks a download
            # is in progress so they wait on _SUCCESS below instead
            with open(os.path.join(index_directory, '_STARTED'), 'w') \
                as started_stream:
                print >> started_stream, 'STARTED'
            for extension in ['.1.bt2', '.2.bt2', '.3.bt2', '.4.bt2', 
                                '.rev.1.bt2', '.rev.2.bt2']:
                # BUG FIX: fetch each index component; the original ignored
                # "extension" and requested the extensionless base URL six
                # times, so no .bt2 file was ever downloaded
                mover.get(Url(bowtie2_index_base + extension),
                            index_directory)
            with open(os.path.join(index_directory, '_SUCCESS'), 'w') \
                as success_stream:
                print >> success_stream, 'SUCCESS'
        # Poll until whichever task owns the download finishes
        while not os.path.exists(os.path.join(index_directory, '_SUCCESS')):
            time.sleep(0.5)
        bowtie2_index_base = os.path.join(index_directory, index_basename)
    global _input_line_count
    temp_dir_path = make_temp_dir(scratch)
    # Clean up temp dir even on unexpected exit
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])
    reads_file = os.path.join(temp_dir_path, 'reads.temp.gz')
    with xopen(True, reads_file, 'w', gzip_level) as reads_stream:
        # NOTE(review): enumerate() is 0-based, so _input_line_count ends up
        # one less than the number of lines read — confirm downstream
        # counter semantics
        for _input_line_count, line in enumerate(input_stream):
            seq = line.strip()
            # Bowtie 2 --12 format: name, sequence, dummy max quality string
            print >> reads_stream, '\t'.join([seq, seq, 'I'*len(seq)])
    input_command = 'gzip -cd %s' % reads_file
    bowtie_command = ' '.join([bowtie2_exe,
        bowtie2_args if bowtie2_args is not None else '',
        ' --local -t --no-hd --mm -x', bowtie2_index_base, '--12 -',
        '--score-min L,%d,0' % score_min,
        '-D 24 -R 3 -N 1 -L 20 -i L,4,0'])
    # Pipe alignments into the companion *_delegate.py script for parsing
    delegate_command = ''.join(
            [sys.executable, ' ', os.path.realpath(__file__)[:-3],
                '_delegate.py --report-multiplier %08f --fudge %d %s %s'
                    % (report_multiplier, fudge,
                        '--stranded' if stranded else '',
                        '--verbose' if verbose else '')]
        )
    full_command = ' | '.join([input_command,
                                bowtie_command, delegate_command])
    print >> sys.stderr, 'Starting Bowtie2 with command: ' + full_command
    # pipefail ensures a failure anywhere in the pipeline surfaces in the
    # exit code rather than being masked by the last stage
    bowtie_process = subprocess.Popen(' '.join(
                    ['set -exo pipefail;', full_command]
                ), bufsize=-1, stdout=sys.stdout,
                stderr=sys.stderr, shell=True, executable='/bin/bash')
    return_code = bowtie_process.wait()
    if return_code:
        raise RuntimeError('Error occurred while reading Bowtie 2 output; '
                           'exitlevel was %d.' % return_code)
for input_line_count, line in enumerate(sys.stdin):
    # Kill offset from start of manifest file
    tokens = line.strip().split('\t')[1:]
    try:
        stripped = tokens[0].strip()
        # Skip comment lines and blank lines
        if stripped[0] == '#' or not line.strip():
            continue
    except IndexError:
        # Line had no fields after the offset; skip it
        continue
    token_count = len(tokens)
    # Manifest records are either single-end (3 fields) or paired-end
    # (5 fields)
    # NOTE(review): assert is stripped under python -O; an explicit raise
    # would be safer for input validation
    assert token_count in [
        3, 5
    ], ('Line {} of input has {} fields, but 3 or 5 are expected.').format(
        input_line_count + 1, token_count)
    file_to_count = tokens[0]
    # Only local files can be counted here; pass non-local records through
    # unchanged (paired-end records need both mates local)
    if (not ((token_count == 3 and Url(tokens[0]).is_local) or
             (token_count == 5 and Url(tokens[0]).is_local and
              Url(tokens[2]).is_local))):
        sys.stdout.write(line)
        output_line_count += 1
        continue
    # Sniff the record format from the file's first character
    with xopen(None, file_to_count) as input_stream:
        first_char = input_stream.readline()[0]
        if first_char in fastq_cues:
            # 4 lines per record
            line_divider = 4
        elif first_char in fasta_cues:
            # FASTA: 2 lines per record
            line_divider = 2
        else:
            raise RuntimeError(
                'File "{}" is neither a FASTA nor a FASTQ file.'.format(