    return (sorted_list[index] + sorted_list[index + 1]) / 2.0

library_size = args.library_size * 1000000
start_time = time.time()
input_line_count, output_line_count = 0, 0
bin_count = 0
# For converting RNAMEs to number strings
reference_index = bowtie_index.BowtieIndexReference(
        os.path.expandvars(args.bowtie_idx)
    )
manifest_object = manifest.LabelsAndIndices(
        os.path.expandvars(args.manifest)
    )
# Grab read counts
mapped_read_counts, unique_mapped_read_counts = {}, {}
with xopen(None, args.read_counts) as read_count_stream:
    read_count_stream.readline()  # skip header line
    for line in read_count_stream:
        tokens = line.strip().split('\t')
        sample_index = manifest_object.label_to_index[tokens[0]]
        (mapped_read_counts[sample_index],
            unique_mapped_read_counts[sample_index]) = [
                    int(token) for token in tokens[-2].split(',')
                ]
try:
    mean_weight = 1. / len([_ for _ in mapped_read_counts.values() if _])
except ZeroDivisionError:
    mean_weight = 0.0
try:
    unique_mean_weight = 1. / len(
            [_ for _ in unique_mapped_read_counts.values() if _]
        )
except ZeroDivisionError:
    unique_mean_weight = 0.0
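# Illustration only -- a minimal, hypothetical sketch of the median helper
# whose tail appears above ("sorted_list", "index", and the even-length
# branch are taken from that orphaned return; the rest is assumed, not
# Rail-RNA's actual code).
def median(values):
    sorted_list = sorted(values)
    index = (len(sorted_list) - 1) / 2  # middle (or left-middle) element
    if len(sorted_list) % 2:
        # Odd count: the middle element is the median
        return float(sorted_list[index])
    # Even count: average the two middle elements
    return (sorted_list[index] + sorted_list[index + 1]) / 2.0

assert median([4, 1, 3, 2]) == 2.5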
def go(nucleotides_per_input=8000000, gzip_output=True, gzip_level=3,
        to_stdout=False, push='.', mover=filemover.FileMover(),
        verbose=False, scratch=None, bin_qualities=True, short_qnames=False,
        skip_bad_records=False, workspace_dir=None,
        fastq_dump_exe='fastq-dump', ignore_missing_sra_samples=False):
    """ Runs Rail-RNA-preprocess

        Input (read from stdin)
        ----------------------------
        Tab-separated fields:
        ---If URL is local:
        1. #!splitload
        2. \x1d-separated list of 0-based indexes of reads at which to
            start each new file
        3. \x1d-separated list of numbers of reads to include in gzipped
            files
        4. \x1d-separated list of manifest lines whose tabs are replaced by
            \x1es

        ---Otherwise:
        manifest line

        A manifest line has the following format

        (for single-end reads)
        <URL>(tab)<Optional MD5>(tab)<Sample label>

        (for paired-end reads)
        <URL 1>(tab)<Optional MD5 1>(tab)<URL 2>(tab)<Optional MD5 2>(tab)
        <Sample label>

        Hadoop output (written to stdout)
        ----------------------------
        None.

        Other output (written to directory specified by command-line
        parameter --push)
        ____________________________
        Files containing input data in one of the following formats:

        Format 1 (single-end, 4-column):
          1. Nucleotide sequence or its reversed complement, whichever is
            first in alphabetical order
          2. 1 if sequence was reverse-complemented else 0
          3. Name
          4. Quality sequence or its reverse, whichever corresponds to field 1

        Format 2 (paired, 2 lines, 4 columns each; same fields as
        single-end):
          1. Nucleotide sequence for mate 1 or its reversed complement,
            whichever is first in alphabetical order
          2. 1 if sequence was reverse-complemented else 0
          3. Name for mate 1
          4. Quality sequence for mate 1 or its reverse, whichever
            corresponds to field 1
        (new line)
          1. Nucleotide sequence for mate 2 or its reversed complement,
            whichever is first in alphabetical order
          2. 1 if sequence was reverse complemented else 0
          3. Name for mate 2
          4. Quality sequence for mate 2 or its reverse, whichever
            corresponds to field 1

        Quality sequences are strings of Is for FASTA input.

        nucleotides_per_input: maximum number of nucleotides to put in a
            given input file
        gzip_output: True iff preprocessed input should be gzipped
        gzip_level: level of gzip compression to use
        push: where to send output
        verbose: True iff extra debugging statements should be printed to
            stderr
        scratch: scratch directory for storing temporary files or None if
            securely created temporary directory
        bin_qualities: True iff quality string should be binned according to
            rules in _mismatch_penalties_to_quality_scores and
            round_quality_string() defined in go()
        short_qnames: True iff original qname should be killed and a new
            qname should be written in a short base64-encoded format
        skip_bad_records: True iff bad records should be skipped; otherwise,
            raises exception if bad record is encountered
        workspace_dir: where to use fastq-dump -- needed for working with
            dbGaP data. None if temporary dir should be used.
        fastq_dump_exe: path to fastq-dump executable
        ignore_missing_sra_samples: does not return error if fastq-dump
            doesn't find a sample

        No return value
    """
    if bin_qualities:
        import math
        def round_quality_string(qual):
            """ Bins phred+33 quality string to improve compression.

                Uses 5-bin scheme that does not affect Bowtie 2 alignments

                qual: quality string

                Return value: "binned" quality string.
""" return ''.join( [str(int( _MN + math.floor((_MX - _MN) * min( ord(qual_char) - 33.0, 40.0 ) / 40.0) )) for qual_char in qual]).translate( _mismatch_penalties_to_quality_scores ) else: def round_quality_string(qual): """ Leaves quality string unbinned and untouched. qual: quality string Return value: qual """ return qual global _input_line_count, _output_line_count skip_stubs = False temp_dir = make_temp_dir(scratch) print >>sys.stderr, 'Created local destination directory "%s".' % temp_dir register_cleanup(tempdel.remove_temporary_directories, [temp_dir]) input_line_count, output_line_count = 0, 0 if not to_stdout: push_url = Url(push) if push_url.is_local: destination = push elif push_url.is_s3 or push_url.is_hdfs or push_url.is_nfs: destination = temp_dir else: raise RuntimeError('Push destination must be ' 'on S3, HDFS, NFS, or local.') fastq_cues = set(['@']) fasta_cues = set(['>', ';']) source_dict = {} onward = False for line in sys.stdin: _input_line_count += 1 if not line.strip(): continue # Kill offset from start of manifest file try: tokens = line.strip().split('\t')[1:] if tokens[0][0] == '#' and tokens[0] != '#!splitload': # Comment line continue except IndexError: # Be robust to bad lines continue token_count = len(tokens) qual_getter = None if tokens[0] == '#!splitload': '''Line specifies precisely how records from files should be placed.''' assert not to_stdout, ('Split manifest line inconsistent with ' 'writing to stdout.') qual_getter = phred_converter(phred_format=tokens[-1]) indexes = tokens[1].split('\x1d') read_counts = tokens[2].split('\x1d') manifest_lines = [token.split('\x1e') for token in tokens[3].split('\x1d')] assert len(indexes) == len(read_counts) == len(manifest_lines) for i, manifest_line in enumerate(manifest_lines): manifest_line_field_count = len(manifest_line) if manifest_line_field_count == 3: source_dict[(Url(manifest_line[0]),)] = ( manifest_line[-1], int(indexes[i]), int(read_counts[i]) ) else: assert manifest_line_field_count == 5 source_dict[(Url(manifest_line[0]), Url(manifest_line[2]))] = ( manifest_line[-1], int(indexes[i]), int(read_counts[i]) ) elif token_count == 3: # SRA or single-end reads source_dict[(Url(tokens[0]),)] = (tokens[-1],) elif token_count == 5: # Paired-end reads source_dict[(Url(tokens[0]), Url(tokens[2]))] = (tokens[-1],) else: # Not a valid line, but continue for robustness continue file_number = 0 for source_urls in source_dict: sample_label = source_dict[source_urls][0] downloaded = set() sources = [] records_printed = 0 if len(source_dict[source_urls]) == 3: skip_count = source_dict[source_urls][1] if len(source_urls) == 2: records_to_consume = source_dict[source_urls][2] if skip_count % 2: skip_count -= 1 records_to_consume += 1 if records_to_consume % 2: records_to_consume -= 1 # Index reads according to order in input to shorten read names read_index = skip_count / 2 # Index reads in pairs else: records_to_consume = source_dict[source_urls][2] read_index = skip_count else: skip_count = 0 records_to_consume = None # Consume all records read_index = 0 assert (records_to_consume >= 0 or records_to_consume is None), ( 'Negative value %d of records to consume encountered.' ) % records_to_consume if records_to_consume == 0: continue skipped = False for source_url in source_urls: if not source_url.is_local: # Download print >>sys.stderr, 'Retrieving URL "%s"...' 
                    % source_url.to_url()
                if source_url.is_dbgap:
                    download_dir = workspace_dir
                elif source_url.is_sra:
                    download_dir = temp_dir
                if source_url.is_sra:
                    sra_accession = source_url.to_url()
                    fastq_dump_command = (
                            'set -exo pipefail; cd {download_dir}; '
                            '{fastq_dump_exe} -I -X 10000 --split-files '
                            '{sra_accession}'
                        ).format(download_dir=download_dir,
                                    fastq_dump_exe=fastq_dump_exe,
                                    sra_accession=sra_accession)
                    try:
                        subprocess.check_call(
                                fastq_dump_command, shell=True,
                                executable='/bin/bash',
                                stdout=sys.stderr
                            )
                    except subprocess.CalledProcessError as e:
                        if e.returncode == 3 and ignore_missing_sra_samples:
                            onward = True
                            break
                        else:
                            raise RuntimeError(
                                    ('Error "%s" encountered executing '
                                     'command "%s".') % (e.output,
                                                         fastq_dump_command))
                    import glob
                    sra_fastq_files = sorted(
                            glob.glob(os.path.join(download_dir,
                                        '%s[_.]*' % sra_accession))
                        ) # ensure 1 before 2 if paired-end
                    # Schedule for deletion
                    def silent_remove(filename):
                        try:
                            os.remove(filename)
                        except OSError:
                            pass
                    for sra_fastq_file in sra_fastq_files:
                        register_cleanup(silent_remove, sra_fastq_file)
                    sra_file_count = len(sra_fastq_files)
                    check_for_paired = False
                    if sra_file_count == 1:
                        sra_paired_end = False
                        print >>sys.stderr, 'Detected single-end SRA sample.'
                    elif sra_file_count in [2, 3]:
                        print >>sys.stderr, ('2 or 3 FASTQ files detected. '
                                             'Checking for barcodes...')
                        check_for_paired = True
                    else:
                        raise RuntimeError(
                                ('Unexpected number of files "%d" output '
                                 'by fastq-dump command "%s".')
                                    % (sra_file_count, fastq_dump_command)
                            )
                    if check_for_paired:
                        # Get max/min read lengths from FASTQ
                        with open(
                                    sra_fastq_files[sra_file_count - 2]
                                ) as fastq_stream:
                            max_len, min_len = (
                                    max_min_read_lengths_from_fastq_stream(
                                        fastq_stream
                                    )
                                )
                            print >>sys.stderr, (
                                    'Max/min read length found in candidate '
                                    'barcode FASTQ was {}/{}.'
                                ).format(max_len, min_len)
                            if max_len <= _max_stubby_read_length:
                                print >>sys.stderr, (
                                        'Assumed barcode FASTQ.'
                                    )
                                skip_stubs = True
                                if sra_file_count == 2:
                                    sra_paired_end = False
                                else:
                                    sra_paired_end = True
                            else:
                                if sra_file_count == 2:
                                    sra_paired_end = True
                                else:
                                    raise RuntimeError(
                                            '3 FASTQs detected, but one of '
                                            'them was not recognized as '
                                            'containing barcodes.'
                                        )
                    # Guess quality from first 10k lines
                    with xopen(None, sra_fastq_files[0]) as source_stream:
                        qual_getter = phred_converter(
                                            fastq_stream=source_stream
                                        )
                    for sra_fastq_file in sra_fastq_files:
                        os.remove(sra_fastq_file)
                    sources.append(os.devnull)
                    fastq_dump_command = (
                            'set -exo pipefail; cd {download_dir}; '
                            '{fastq_dump_exe} --split-spot -I --stdout '
                            '{sra_accession}'
                        ).format(download_dir=download_dir,
                                    fastq_dump_exe=fastq_dump_exe,
                                    sra_accession=sra_accession)
                    if skip_stubs:
                        fastq_dump_command += (
                                ' | awk \'BEGIN {{OFS = "\\n"}} '
                                '{{header = $0; '
                                'getline seq; getline qheader; getline qseq; '
                                'if (length(seq) > {min_len}) {{print header, '
                                'seq, qheader, qseq}}}}\''
                            ).format(min_len=_max_stubby_read_length)
                    print >>sys.stderr, fastq_dump_command
                    sra_process = subprocess.Popen(fastq_dump_command,
                                                    shell=True,
                                                    executable='/bin/bash',
                                                    stdout=subprocess.PIPE,
                                                    bufsize=-1)
                else:
                    mover.get(source_url, temp_dir)
                    downloaded = list(
                            set(os.listdir(temp_dir)).difference(downloaded)
                        )
                    sources.append(os.path.join(temp_dir,
                                                list(downloaded)[0]))
            else:
                sources.append(source_url.to_url())
        if onward: continue
        '''Use os.devnull so single- and paired-end data can be handled in
        one loop.'''
        if len(sources) == 1:
            sources.append(os.devnull)
        if qual_getter is None:
            # Figure out Phred format
            with xopen(None, sources[0]) as source_stream:
                qual_getter = phred_converter(fastq_stream=source_stream)
        with xopen(None, sources[0]) as source_stream_1, xopen(
                None, sources[1]
            ) as source_stream_2:
            source_streams = [source_stream_1, source_stream_2]
            reorganize = all([source == os.devnull for source in sources])
            if reorganize:
                # SRA data is live
                if sra_paired_end:
                    source_streams = [sra_process.stdout, sra_process.stdout]
                else:
                    source_streams = [sra_process.stdout, open(os.devnull)]
            break_outer_loop = False
            while True:
                if not to_stdout:
                    '''Name files using Hadoop task environment property
                    mapred.task.partition.'''
                    if gzip_output:
                        try:
                            output_file = os.path.join(
                                    destination,
                                    '.'.join([
                                        os.environ['mapred_task_partition'],
                                        str(file_number), 'gz'
                                    ])
                                )
                        except KeyError:
                            '''Hadoop 2.x: mapreduce.task.partition; see
                            http://hadoop.apache.org/docs/r2.0.3-alpha/
                            hadoop-project-dist/hadoop-common/
                            DeprecatedProperties.html.'''
                            output_file = os.path.join(
                                    destination,
                                    '.'.join([
                                        os.environ[
                                            'mapreduce_task_partition'],
                                        str(file_number), 'gz'
                                    ])
                                )
                        open_args = [output_file, 'a', gzip_level]
                    else:
                        try:
                            output_file = os.path.join(
                                    destination,
                                    '.'.join([
                                        os.environ['mapred_task_partition'],
                                        str(file_number)
                                    ])
                                )
                        except KeyError:
                            output_file = os.path.join(
                                    destination,
                                    '.'.join([
                                        os.environ[
                                            'mapreduce_task_partition'],
                                        str(file_number)
                                    ])
                                )
                        open_args = [output_file, 'a']
                    try:
                        os.makedirs(os.path.dirname(output_file))
                    except OSError:
                        pass
                else:
                    open_args = []
                '''Use xopen to handle compressed streams and normal streams
                generally.'''
                with xopen(gzip_output if not to_stdout else '-', *open_args) \
                    as output_stream:
                    perform_push = False
                    line_numbers = [0, 0]
                    read_next_line = True
                    nucs_read = 0
                    pairs_read = 0
                    while True:
                        if read_next_line:
                            # Read next line only if FASTA mode didn't already
                            lines = []
                            for source_stream in source_streams:
                                lines.append(source_stream.readline())
                        read_next_line = True
                        if not lines[0]:
                            break_outer_loop = True
                            break
                        line_numbers = [i + 1 for i in line_numbers]
                        lines = [line.strip() for line in lines]
                        bad_record_skip = False
                        if lines[0][0] in fastq_cues:
                            if records_to_consume and not skipped:
                                '''Skip lines as necessary; for paired-end
                                reads skip the largest even number of records
                                less than
                                records_to_consume.'''
                                if len(source_urls) == 1:
                                    # single-end
                                    line_skip_count = max(
                                            skip_count * 4 - 1, 0
                                        )
                                else:
                                    # paired-end
                                    line_skip_count = max(
                                            ((skip_count / 2) * 4 - 1), 0
                                        )
                                for _ in xrange(line_skip_count):
                                    next(source_stream_2)
                                for _ in xrange(line_skip_count):
                                    next(source_stream_1)
                                if skip_count:
                                    lines = []
                                    for source_stream in source_streams:
                                        lines.append(source_stream.readline())
                                    if not lines[0]:
                                        break_outer_loop = True
                                        break
                                    lines = [line.strip() for line in lines]
                                skipped = True
                            seqs = [source_stream.readline().strip()
                                        for source_stream in source_streams]
                            line_numbers = [i + 1 for i in line_numbers]
                            plus_lines = [source_stream.readline().strip()
                                            for source_stream
                                            in source_streams]
                            line_numbers = [i + 1 for i in line_numbers]
                            quals = [source_stream.readline().strip()
                                        for source_stream in source_streams]
                            if reorganize and sra_paired_end:
                                # Fix order!
                                lines, seqs, plus_lines, quals = (
                                        [lines[0], plus_lines[0]],
                                        [lines[1], plus_lines[1]],
                                        [seqs[0], quals[0]],
                                        [seqs[1], quals[1]]
                                    )
                            try:
                                assert plus_lines[0][0] == '+', (
                                        'Malformed read "%s" at line %d of '
                                        'file "%s".'
                                    ) % (lines[0], line_numbers[0], sources[0])
                                if plus_lines[1]:
                                    assert plus_lines[1][0] == '+', (
                                            'Malformed read "%s" at line %d '
                                            'of file "%s".'
                                        ) % (lines[1], line_numbers[1],
                                                sources[1])
                                try:
                                    # Kill spaces in name
                                    original_qnames = \
                                        [line[1:].replace(' ', '_')
                                            for line in lines]
                                except IndexError:
                                    raise RuntimeError(
                                            'Error finding QNAME at '
                                            'line %d of either %s or %s' % (
                                                        line_numbers[0],
                                                        sources[0],
                                                        sources[1]
                                                    )
                                        )
                            except (AssertionError,
                                    IndexError, RuntimeError) as e:
                                if skip_bad_records:
                                    print >>sys.stderr, ('Error "%s" '
                                            'encountered; skipping bad '
                                            'record.') % e.message
                                    for source_stream in source_streams:
                                        source_stream.readline()
                                    line_numbers = [
                                            i + 1 for i in line_numbers
                                        ]
                                    bad_record_skip = True
                                else:
                                    raise
                            else:
                                try:
                                    quals = [
                                            qual_getter(qual)
                                            for qual in quals
                                        ]
                                except Exception as e:
                                    if skip_bad_records:
                                        print >>sys.stderr, (
                                                'Error "%s" encountered '
                                                'trying to convert quality '
                                                'string to Sanger format; '
                                                'skipping bad record.'
                                            ) % e.message
                                        bad_record_skip = True
                                    else:
                                        raise
                                line_numbers = [i + 1 for i in line_numbers]
                                try:
                                    for i in xrange(2):
                                        assert len(seqs[i]) == len(quals[i]), (
                                                'Length of read sequence '
                                                'does not match length of '
                                                'quality string at line %d '
                                                'of file "%s".'
                                            ) % (line_numbers[i], sources[i])
                                except (AssertionError, IndexError) as e:
                                    if skip_bad_records:
                                        print >>sys.stderr, (
                                                'Error "%s" encountered; '
                                                'skipping bad record.'
                                            ) % e.message
                                        bad_record_skip = True
                                    else:
                                        raise
                        elif lines[0][0] in fasta_cues:
                            seqs = [[], []]
                            next_lines = []
                            for p, source_stream in enumerate(source_streams):
                                while True:
                                    next_line \
                                        = source_stream.readline().strip()
                                    try:
                                        if next_line[0] in fasta_cues:
                                            break
                                        else:
                                            seqs[p].append(next_line)
                                    except IndexError:
                                        break
                                next_lines.append(next_line)
                            seqs = [''.join(seq) for seq in seqs]
                            line_numbers = [i + 1 for i in line_numbers]
                            try:
                                try:
                                    # Kill spaces in name
                                    original_qnames = \
                                        [line[1:].replace(' ', '_')
                                            for line in lines]
                                except IndexError:
                                    raise RuntimeError(
                                            'Error finding QNAME at '
                                            'line %d of either %s or %s' % (
                                                        line_numbers[0],
                                                        sources[0],
                                                        sources[1]
                                                    )
                                        )
                            except (AssertionError,
                                    IndexError, RuntimeError) as e:
                                if skip_bad_records:
                                    print >>sys.stderr, ('Error "%s" '
                                            'encountered; skipping bad record.'
                                        ) % e.message
                                    for source_stream in source_streams:
                                        source_stream.readline()
                                    line_numbers = [
                                            i + 1 for i in line_numbers
                                        ]
                                    bad_record_skip = True
                                else:
                                    raise
                            else:
                                try:
                                    quals = [
                                            'h'*len(seq) for seq in seqs
                                        ]
                                except Exception as e:
                                    if skip_bad_records:
                                        print >>sys.stderr, (
                                                'Error "%s" encountered '
                                                'trying to convert quality '
                                                'string to Sanger format; '
                                                'skipping bad record.'
                                            ) % e.message
                                        bad_record_skip = True
                                    else:
                                        raise
                                line_numbers = [i + 1 for i in line_numbers]
                            lines = next_lines
                            read_next_line = False
                        if bad_record_skip:
                            seqs = []
                            # Fake record-printing to get to
                            # records_to_consume
                            if source_streams[-1].name == os.devnull:
                                records_printed += 1
                            else:
                                records_printed += 2
                        elif len(original_qnames) == 2 and original_qnames[1]:
                            # Paired-end write
                            if original_qnames[0] == original_qnames[1]:
                                # Add paired-end identifiers
                                original_qnames[0] += '/1'
                                original_qnames[1] += '/2'
                            assert seqs[1]
                            assert quals[1]
                            seqs = [seq.upper() for seq in seqs]
                            reversed_complement_seqs = [
                                    seqs[0][::-1].translate(
                                        _reversed_complement_translation_table
                                    ),
                                    seqs[1][::-1].translate(
                                        _reversed_complement_translation_table
                                    )
                                ]
                            if seqs[0] < reversed_complement_seqs[0]:
                                left_seq = seqs[0]
                                left_qual = quals[0]
                                left_reversed = '0'
                            else:
                                left_seq = reversed_complement_seqs[0]
                                left_qual = quals[0][::-1]
                                left_reversed = '1'
                            if seqs[1] < reversed_complement_seqs[1]:
                                right_seq = seqs[1]
                                right_qual = quals[1]
                                right_reversed = '0'
                            else:
                                right_seq = reversed_complement_seqs[1]
                                right_qual = quals[1][::-1]
                                right_reversed = '1'
                            if short_qnames:
                                left_qname_to_write = encode(read_index) \
                                                        + '/1'
                                right_qname_to_write = encode(
                                                            read_index
                                                        ) + '/2'
                            else:
                                left_qname_to_write = original_qnames[0]
                                right_qname_to_write = original_qnames[1]
                            print >>output_stream, '\t'.join(
                                        [
                                            left_seq,
                                            left_reversed,
                                            qname_from_read(
                                                    left_qname_to_write,
                                                    seqs[0] + quals[0],
                                                    sample_label,
                                                    mate=seqs[1]
                                                ),
                                            '\n'.join([
                                                round_quality_string(
                                                    left_qual
                                                ), right_seq
                                            ]),
                                            right_reversed,
                                            qname_from_read(
                                                    right_qname_to_write,
                                                    seqs[1] + quals[1],
                                                    sample_label,
                                                    mate=seqs[0]
                                                ),
                                            round_quality_string(right_qual)
                                        ]
                                    )
                            records_printed += 2
                            _output_line_count += 1
                        else:
                            seqs[0] = seqs[0].upper()
                            reversed_complement_seqs = [
                                    seqs[0][::-1].translate(
                                        _reversed_complement_translation_table
                                    )
                                ]
                            # Single-end write
                            if seqs[0] < reversed_complement_seqs[0]:
                                seq = seqs[0]
                                qual = quals[0]
                                is_reversed = '0'
                            else:
                                seq = reversed_complement_seqs[0]
                                qual = quals[0][::-1]
                                is_reversed = '1'
                            if short_qnames:
                                qname_to_write = encode(read_index)
                            else:
                                qname_to_write = original_qnames[0]
                            print >>output_stream, '\t'.join(
                                        [
                                            seq,
                                            is_reversed,
                                            qname_from_read(
                                                    qname_to_write,
                                                    seqs[0] + quals[0],
                                                    sample_label
                                                ),
                                            round_quality_string(qual)
                                        ]
                                    )
                            records_printed += 1
                            _output_line_count += 1
                        read_index += 1
                        for seq in seqs:
                            nucs_read += len(seq)
                        if records_printed == records_to_consume:
                            break_outer_loop = True
                            perform_push = True
                            break
                        if not to_stdout and not records_to_consume and \
                            nucs_read > nucleotides_per_input:
                            file_number += 1
                            break
                if verbose:
                    print >>sys.stderr, (
                            'Exited with statement; line numbers are %s'
                            % line_numbers
                        )
                if (not to_stdout) and (push_url.is_nfs or
                    push_url.is_s3 or push_url.is_hdfs) \
                    and ((not records_to_consume) or
                         (records_to_consume and perform_push)):
                    print >>sys.stderr, 'Pushing "%s" to "%s" ...' \
                        % (
                            output_file,
                            push_url.to_url()
                        )
                    print >>sys.stderr, 'reporter:status:alive'
                    mover.put(output_file,
                                push_url.plus(os.path.basename(
                                                        output_file
                                                    )))
                    try:
                        os.remove(output_file)
                    except OSError:
                        pass
                if break_outer_loop: break
            if verbose:
                print >>sys.stderr, 'Exiting source streams...'
        if verbose:
            print >>sys.stderr, 'Exited source streams.'
        # Clear temporary directory
        for input_file in os.listdir(temp_dir):
            try:
                os.remove(os.path.join(temp_dir, input_file))
            except OSError:
                pass
        if 'sra_process' in locals():
            sra_process.stdout.close()
            sra_return_code = sra_process.wait()
            if sra_return_code > 0:
                raise RuntimeError(('fastq-dump terminated with exit '
                                    'code %d. Command run was "%s".')
                                        % (sra_return_code,
                                            fastq_dump_command))
            del sra_process
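# Illustration only: a self-contained rendering of the 5-bin quality scheme
# round_quality_string() applies above, minus the final translate() step
# (the _mismatch_penalties_to_quality_scores table is defined elsewhere in
# Rail-RNA). _MN and _MX stand in for the module's bin bounds and are
# assumptions here.
import math

_MN, _MX = 0, 4

def bin_phred33(qual):
    # Map each phred+33 character to one of five bin indexes, capping
    # quality at 40 so extreme scores don't create extra symbols
    return ''.join([str(int(_MN + math.floor(
                        (_MX - _MN) * min(ord(qual_char) - 33.0, 40.0) / 40.0)))
                    for qual_char in qual])

assert bin_phred33('I#5') == '402'  # Q40 -> top bin, Q2 -> bottom bin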
def go(input_stream=sys.stdin, output_stream=sys.stdout, bowtie2_exe='bowtie2',
        bowtie2_index_base='genome', bowtie2_args='', verbose=False,
        report_multiplier=1.2, stranded=False, fudge=5, score_min=60,
        gzip_level=3, mover=filemover.FileMover(), intermediate_dir='.',
        scratch=None):
    """ Runs Rail-RNA-cointron_enum

        Alignment script for MapReduce pipelines that wraps Bowtie 2. Finds
        introns that cooccur on reads by local alignments to transcriptome
        elements from Bowtie 2.

        Input (read from stdin)
        ----------------------------
        Tab-delimited output tuple columns (readletize)
        1. SEQ or its reversed complement, whichever is first in alphabetical
            order
        2. Comma-separated list of sample labels if field 1 is the read
            sequence; '\x1c' if empty
        3. Comma-separated list of sample labels if field 1 is the reversed
            complement of the read sequence; '\x1c' if empty

        Hadoop output (written to stdout)
        ----------------------------
        Tab-delimited tuple columns:
        1. Reference name (RNAME in SAM format) + '+' or '-' indicating
            which strand is the sense strand
        2. Comma-separated list of intron start positions in configuration
        3. Comma-separated list of intron end positions in configuration
        4. left_extend_size: by how many bases on the left side of an intron
            the reference should extend
        5. right_extend_size: by how many bases on the right side of an
            intron the reference should extend
        6. Read sequence

        input_stream: where to find input reads.
        output_stream: where to emit exonic chunks and introns.
        bowtie2_exe: filename of Bowtie 2 executable; include path if not in
            $PATH.
        bowtie2_index_base: the basename of the Bowtie index files associated
            with the reference.
        bowtie2_args: string containing precisely extra command-line
            arguments to pass to Bowtie 2, e.g., "--tryhard --best"; or None.
        verbose: True iff more informative messages should be written to
            stderr.
        report_multiplier: if verbose is True, the line number of an
            alignment written to stderr increases exponentially with base
            report_multiplier.
        stranded: True iff input reads are strand-specific; this affects
            whether an output partition has a terminal '+' or '-' indicating
            the sense strand. Further, if stranded is True, an alignment is
            returned only if its strand agrees with the intron's strand.
        fudge: by how many bases to extend left and right extend sizes to
            accommodate potential indels
        score_min: Bowtie 2 constant minimum alignment score
        gzip_level: compression level to use for temporary files
        mover: FileMover object, for use in case the Bowtie 2 index needs to
            be pulled from S3
        intermediate_dir: where intermediates are stored; for temporarily
            storing transcript index if it needs to be pulled from S3
        scratch: scratch directory for storing temporary files or None if
            securely created temporary directory

        No return value.
""" bowtie2_index_base_url = Url(bowtie2_index_base) if bowtie2_index_base_url.is_s3: index_basename = os.path.basename(bowtie2_index_base) index_directory = os.path.join(intermediate_dir, 'transcript_index') if not os.path.exists(os.path.join(index_directory, '_STARTED')): # Download index with open(os.path.join(index_directory, '_STARTED'), 'w') \ as started_stream: print >>started_stream, 'STARTED' for extension in ['.1.bt2', '.2.bt2', '.3.bt2', '.4.bt2', '.rev.1.bt2', '.rev.2.bt2']: mover.get(bowtie2_index_base_url, index_directory) with open(os.path.join(index_directory, '_SUCCESS'), 'w') \ as success_stream: print >>success_stream, 'SUCCESS' while not os.path.exists(os.path.join(index_directory, '_SUCCESS')): time.sleep(0.5) bowtie2_index_base = os.path.join(index_directory, index_basename) global _input_line_count temp_dir_path = make_temp_dir(scratch) register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path]) reads_file = os.path.join(temp_dir_path, 'reads.temp.gz') with xopen(True, reads_file, 'w', gzip_level) as reads_stream: for _input_line_count, line in enumerate(input_stream): seq = line.strip() print >>reads_stream, '\t'.join([seq, seq, 'I'*len(seq)]) input_command = 'gzip -cd %s' % reads_file bowtie_command = ' '.join([bowtie2_exe, bowtie2_args if bowtie2_args is not None else '', ' --local -t --no-hd --mm -x', bowtie2_index_base, '--12 -', '--score-min L,%d,0' % score_min, '-D 24 -R 3 -N 1 -L 20 -i L,4,0']) delegate_command = ''.join( [sys.executable, ' ', os.path.realpath(__file__)[:-3], '_delegate.py --report-multiplier %08f --fudge %d %s %s' % (report_multiplier, fudge, '--stranded' if stranded else '', '--verbose' if verbose else '')] ) full_command = ' | '.join([input_command, bowtie_command, delegate_command]) print >>sys.stderr, 'Starting Bowtie2 with command: ' + full_command bowtie_process = subprocess.Popen(' '.join( ['set -exo pipefail;', full_command] ), bufsize=-1, stdout=sys.stdout, stderr=sys.stderr, shell=True, executable='/bin/bash') return_code = bowtie_process.wait() if return_code: raise RuntimeError('Error occurred while reading Bowtie 2 output; ' 'exitlevel was %d.' % return_code)
    assert token_count in [3, 5], (
            'Line {} of input has {} fields, but 3 or 5 are expected.'
        ).format(input_line_count + 1, token_count)
    file_to_count = tokens[0]
    if (not ((token_count == 3 and Url(tokens[0]).is_local)
                or (token_count == 5 and Url(tokens[0]).is_local
                        and Url(tokens[2]).is_local))
        or (Url(tokens[0]).is_local
                and (tokens[0].endswith('.tar.gz')
                        or tokens[0].endswith('.tar.bz2')
                        or tokens[0].endswith('.tar')))):
        sys.stdout.write(line)
        output_line_count += 1
        continue
    with xopen(None, file_to_count) as input_stream:
        first_char = input_stream.readline()[0]
        if first_char in fasta_cues:
            sys.stdout.write(line)
            output_line_count += 1
            continue
        elif first_char not in fastq_cues:
            raise RuntimeError(
                    'File "{}" is neither a FASTA nor a FASTQ file.'.format(
                            file_to_count
                        )
                )
    with xopen(None, file_to_count) as input_stream:
        phred_format, line_count = inferred_phred_format(input_stream)
    lines_and_bytes = str((int(line_count) + 1) / line_divider)
    print '\t'.join(
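# Illustration only: the format sniffing above keys on a file's first
# character -- '@' opens a FASTQ record, while '>' or ';' opens a FASTA
# record. The filename is hypothetical.
fastq_cues = set(['@'])
fasta_cues = set(['>', ';'])

with open('reads.fq') as stream:
    first_char = stream.readline()[0]
if first_char in fastq_cues:
    detected_format = 'fastq'
elif first_char in fasta_cues:
    detected_format = 'fasta'
else:
    raise RuntimeError('Neither a FASTA nor a FASTQ file.')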
    if args.out is not None:
        '''If --out is a local file, just write directly to that file.
        Otherwise, write to a temporary file that will later be uploaded to
        the destination.'''
        output_url = Url(args.out)
        if output_url.is_local:
            try:
                os.makedirs(output_url.to_url())
            except: pass
            output_filename = os.path.join(args.out, args.junction_filename)
        else:
            temp_dir_path = make_temp_dir(
                                tempdel.silentexpandvars(args.scratch)
                            )
            register_cleanup(tempdel.remove_temporary_directories,
                                [temp_dir_path])
            output_filename = args.junction_filename + '.temp'
            output_filename = os.path.join(temp_dir_path, output_filename)
        with xopen(True, output_filename, 'w', args.gzip_level) \
            as output_stream:
            for line in sys.stdin:
                tokens = line.strip().split('\t')
                # Remove leading zeros from ints
                print >>output_stream, '\t'.join(
                        [tokens[0], str(int(tokens[1])),
                            str(int(tokens[2]) - 1), tokens[3], tokens[4]]
                    )
                input_line_count += 1
    else:
        # Default --out is stdout
        for line in sys.stdin:
            tokens = line.strip().split('\t')
            # Remove leading zeros from ints
            print '\t'.join([tokens[0], str(int(tokens[1])),
                                str(int(tokens[2]) - 1), tokens[3],
    if args.out is not None:
        '''If --out is a local file, just write directly to that file.
        Otherwise, write to a temporary file that will later be uploaded to
        the destination.'''
        output_url = Url(args.out)
        if output_url.is_local:
            try:
                os.makedirs(output_url.to_url())
            except: pass
            output_filename = os.path.join(args.out, args.junction_filename)
        else:
            temp_dir_path = make_temp_dir(
                                tempdel.silentexpandvars(args.scratch)
                            )
            register_cleanup(tempdel.remove_temporary_directories,
                                [temp_dir_path])
            output_filename = args.junction_filename + '.temp'
            output_filename = os.path.join(temp_dir_path, output_filename)
        with xopen(True, output_filename, 'w', args.gzip_level) \
            as output_stream:
            for line in sys.stdin:
                counter.add('inputs')
                tokens = line.strip().split('\t')
                # Remove leading zeros from ints
                print >>output_stream, '\t'.join(
                        [tokens[0][:-1], tokens[0][-1], str(int(tokens[1])),
                            str(int(tokens[2]) - 1), tokens[3], tokens[4]]
                    )
                input_line_count += 1
    else:
        # Default --out is stdout
        for line in sys.stdin:
            counter.add('inputs')
            tokens = line.strip().split('\t')
            # Remove leading zeros from ints
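# Illustration only: what "remove leading zeros from ints" means above.
# Intermediate coordinates presumably arrive as zero-padded strings (so
# Hadoop's lexicographic sort orders them numerically); str(int(...))
# strips the padding, and int(tokens[2]) - 1 shifts the end coordinate
# down by one. The token values here are made up.
tokens = ['chr1+', '000012345', '000012400']
print '\t'.join([tokens[0][:-1], tokens[0][-1],
                    str(int(tokens[1])), str(int(tokens[2]) - 1)])
# chr1    +    12345    12399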
def input_files_from_input_stream(input_stream, output_stream,
                                    temp_dir_path=None, verbose=False,
                                    gzip_level=3):
    """ Generates FASTA reference to index and file with reads.

        Each line of the read file is in the following format:

        read number <TAB> SEQ <TAB> QUAL

        input_stream: where to find Hadoop input
        output_stream: where to write unmapped reads
        temp_dir_path: where to store files
        verbose: output extra debugging messages
        gzip_level: gzip compression level (0-9)

        Yield value: tuple (path to FASTA reference file, path to read file)
    """
    global _input_line_count
    if temp_dir_path is None: temp_dir_path = tempfile.mkdtemp()
    prefasta_filename = os.path.join(temp_dir_path, 'temp.prefa')
    deduped_fasta_filename = os.path.join(temp_dir_path, 'temp.deduped.prefa')
    final_fasta_filename = os.path.join(temp_dir_path, 'temp.fa')
    reads_filename = os.path.join(temp_dir_path, 'reads.temp.gz')
    for (counter, ((index_group,), xpartition)) in enumerate(
                                                    xstream(input_stream, 1)
                                                ):
        if verbose:
            print >>sys.stderr, (
                    'Group %d: Writing prefasta and input reads...' % counter
                )
        with open(prefasta_filename, 'w') as fasta_stream:
            with xopen(True, reads_filename, 'w') as read_stream:
                for read_seq, values in itertools.groupby(xpartition,
                                                key=lambda val: val[0]):
                    fasta_printed = False
                    for value in values:
                        _input_line_count += 1
                        if value[1][0] == '0':
                            # Print FASTA line
                            print >>fasta_stream, '\t'.join([value[1][1:-2],
                                                                value[2]])
                            fasta_printed = True
                        elif fasta_printed:
                            '''Add to temporary seq stream only if an
                            associated FASTA line was found.'''
                            if value[1] == '1':
                                print >>read_stream, '\t'.join([value[2],
                                                                read_seq,
                                                                value[3]])
                            else:
                                print >>read_stream, '\t'.join([
                                    value[2],
                                    read_seq[::-1].translate(
                                        _reversed_complement_translation_table
                                    ), value[3][::-1]])
                        else:
                            # Print unmapped read
                            if value[1] == '1':
                                seq_to_write = read_seq
                                qual_to_write = value[3]
                            else:
                                seq_to_write = read_seq[::-1].translate(
                                        _reversed_complement_translation_table
                                    )
                                qual_to_write = value[3][::-1]
                            '''Write only essentials; handle "formal"
                            writing in next step.'''
                            output_stream.write(
                                    '%s\t4\t\x1c\t\x1c\t\x1c\t\x1c'
                                    '\t\x1c\t\x1c\t\x1c\t%s\t%s\n' % (
                                                            value[2],
                                                            seq_to_write,
                                                            qual_to_write
                                                        )
                                )
        if verbose:
            print >>sys.stderr, (
                    'Group %d: Done! Sorting and deduplicating prefasta...'
                    % counter
                )
        # Sort prefasta and eliminate duplicate lines
        dedup_process_return = subprocess.call(
                r'''sort %s | uniq >%s''' % (prefasta_filename,
                                                deduped_fasta_filename),
                shell=True, executable='/bin/bash'
            )
        if dedup_process_return != 0:
            raise RuntimeError(
                    'Problem encountered deduplicating FASTA reference'
                )
        if verbose:
            print >>sys.stderr, (
                    'Group %d: Done! Writing final FASTA.' % counter
                )
        with open(final_fasta_filename, 'w') as final_fasta_stream:
            with open(deduped_fasta_filename) as fasta_stream:
                for line in fasta_stream:
                    rname, seq = line.strip().split('\t')
                    print >>final_fasta_stream, rname
                    final_fasta_stream.write(
                            '\n'.join([seq[i:i+80] for i
                                        in xrange(0, len(seq), 80)])
                        )
                    final_fasta_stream.write('\n')
        os.remove(deduped_fasta_filename)
        os.remove(prefasta_filename)
        yield final_fasta_filename, reads_filename
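# Illustration only: the 80-column FASTA wrapping used when writing the
# final FASTA above, pulled out into a standalone helper (the name is
# ours, not Rail-RNA's).
def wrap_fasta_seq(seq, width=80):
    return '\n'.join([seq[i:i+width] for i in xrange(0, len(seq), width)])

assert wrap_fasta_seq('ACGT' * 45).count('\n') == 2  # 180 bases -> 3 lines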
def go(input_stream=sys.stdin, output_stream=sys.stdout, bowtie2_exe='bowtie2',
        bowtie_index_base='genome', bowtie2_index_base='genome2',
        manifest_file='manifest', bowtie2_args=None, bin_size=10000,
        verbose=False, exon_differentials=True, exon_intervals=False,
        report_multiplier=1.2, min_exon_size=8, search_filter=1,
        min_readlet_size=15, max_readlet_size=25, readlet_interval=12,
        capping_multiplier=1.5, drop_deletions=False, gzip_level=3,
        scratch=None, index_count=1, output_bam_by_chr=False, tie_margin=0,
        no_realign=False, no_polyA=False):
    """ Runs Rail-RNA-align_reads.

        A single pass of Bowtie is run to find end-to-end alignments. Unmapped
        reads are saved for readletizing to determine junctions in successive
        reduce steps as well as for realignment in a later map step.

        Input (read from stdin)
        ----------------------------
        Tab-delimited input tuple columns in a mix of any of the following
        three formats:
        Format 1 (single-end, 4-column):
          1. Nucleotide sequence or its reversed complement, whichever is
            first in alphabetical order
          2. 1 if sequence was reverse-complemented else 0
          3. Name
          4. Quality sequence or its reverse, whichever corresponds to field 1
        Format 2 (paired, 2 lines, 4 columns each; same fields as
        single-end):
          1. Nucleotide sequence for mate 1 or its reversed complement,
            whichever is first in alphabetical order
          2. 1 if sequence was reverse-complemented else 0
          3. Name for mate 1
          4. Quality sequence for mate 1 or its reverse, whichever
            corresponds to field 1
        (new line)
          1. Nucleotide sequence for mate 2 or its reversed complement,
            whichever is first in alphabetical order
          2. 1 if sequence was reverse complemented else 0
          3. Name for mate 2
          4. Quality sequence for mate 2 or its reverse, whichever
            corresponds to field 1

        Input is partitioned and sorted by field 1, the read sequence.

        Hadoop output (written to stdout)
        ----------------------------
        A given RNAME sequence is partitioned into intervals ("bins") of
        some user-specified length (see partition.py).

        Exonic chunks (aka ECs; three formats, any or all of which may be
        emitted):

        Format 1 (exon_ival); tab-delimited output tuple columns:
        1. Reference name (RNAME in SAM format) + ';' + bin number
        2. Sample index
        3. EC start (inclusive) on forward strand
        4. EC end (exclusive) on forward strand

        Format 2 (exon_diff); tab-delimited output tuple columns:
        1. Reference name (RNAME in SAM format) + ';' + bin number
        2. max(EC start, bin start) (inclusive) on forward strand IFF diff
            is positive and EC end (exclusive) on forward strand IFF diff is
            negative
        3. Sample index
        4. '1' if alignment from which diff originates is "unique" according
            to --tie-margin criterion; else '0'
        5. +1 or -1 * count, the number of instances of a read sequence for
            which to print exonic chunks

        Note that only unique alignments are currently output as ivals
        and/or diffs.

        Format 3 (sam); tab-delimited output tuple columns:
        Standard SAM output except fields are in different order, and the
        first field corresponds to sample label. (Fields are reordered to
        facilitate partitioning by sample name/RNAME and sorting by POS.)
        Each line corresponds to a spliced alignment. The order of the
        fields is as follows.
        1. Sample index if outputting BAMs by sample OR sample-rname index
            if outputting BAMs by chr
        2. (Number string representing RNAME; see BowtieIndexReference class
            in bowtie_index for conversion information) OR '0' if outputting
            BAMs by chr
        3. POS
        4. QNAME
        5. FLAG
        6. MAPQ
        7. CIGAR
        8. RNEXT
        9. PNEXT
        10. TLEN
        11. SEQ
        12. QUAL
        ...
        + optional fields

        Insertions/deletions (indel_bed); tab-delimited output tuple columns:
        1. 'I' or 'D' insertion or deletion line
        2. Number string representing RNAME
        3. Start position (Last base before insertion or first base of
            deletion)
        4. End position (Last base before insertion or last base of deletion
            (exclusive))
        5. Inserted sequence for insertions or deleted sequence for deletions
        6. Sample index
        ----Next fields are for junctions only; they are '\x1c' for indels----
        7. '\x1c'
        8. '\x1c'
        --------------------------------------------------------------------
        9. Number of instances of insertion or deletion in sample; this is
            always +1 * count before bed_pre combiner/reducer

        Read whose primary alignment is not end-to-end

        Tab-delimited output tuple columns (unmapped):
        1. Transcriptome Bowtie 2 index group number
        2. SEQ
        3. 1 if SEQ is reverse-complemented, else 0
        4. QNAME
        5. QUAL

        Tab-delimited output tuple columns (readletized):
        1. Readlet sequence or its reversed complement, whichever is first
            in alphabetical order
        2. read sequence ID + ('-' if readlet sequence is
            reverse-complemented; else '+') + '\x1e' + displacement of
            readlet's 5' end from read's 5' end + '\x1e' + displacement of
            readlet's 3' end from read's 3' end (+, for EXACTLY one readlet
            of a read sequence, '\x1e' + read sequence + '\x1e' + (an
            '\x1f'-separated list A of unique sample labels with read
            sequences that match the original read sequence) + '\x1e' + (an
            '\x1f'-separated list of unique sample labels B with read
            sequences that match the reversed complement of the original
            read sequence)) + '\x1e' + (an '\x1f'-separated list of the
            number of instances of the read sequence for each respective
            sample in list A) + '\x1e' + (an '\x1f'-separated list of the
            number of instances of the read sequence's reversed complement
            for each respective sample in list B). Here, a read sequence ID
            takes the form X:Y, where X is the "mapred_task_partition"
            environment variable -- a unique index for a task within a job
            -- and Y is the index of the read sequence relative to the
            beginning of the input stream.

        Tab-delimited tuple columns (postponed_sam):
        Standard 11+ -column raw SAM output

        Single column (unique):
        1. A unique read sequence

        Two columns, exactly one line (dummy); ensures creation of junction
        index:
        1. character "-"
        2. the word "dummy"

        ALL OUTPUT COORDINATES ARE 1-INDEXED.

        input_stream: where to find input reads.
        output_stream: where to emit exonic chunks and junctions.
        bowtie2_exe: filename of Bowtie2 executable; include path if not in
            $PATH.
        bowtie_index_base: the basename of the Bowtie1 index files associated
            with the reference.
        bowtie2_index_base: the basename of the Bowtie2 index files
            associated with the reference.
        manifest_file: filename of manifest
        bowtie2_args: string containing precisely extra command-line
            arguments to pass to first-pass Bowtie2.
        bin_size: genome is partitioned in units of bin_size for later load
            balancing.
        verbose: True iff more informative messages should be written to
            stderr.
        exon_differentials: True iff EC differentials are to be emitted.
        exon_intervals: True iff EC intervals are to be emitted.
        report_multiplier: if verbose is True, the line number of an
            alignment or read written to stderr increases exponentially with
            base report_multiplier.
        min_exon_size: minimum exon size searched for in junction_search.py
            later in pipeline; used to determine how large a soft clip on
            one side of a read is necessary to pass it on to junction search
            pipeline
        search_filter: how large a soft clip on one side of a read is
            necessary to pass it on to junction search pipeline
        min_readlet_size: "capping" readlets (that is, readlets that
            terminate at a given end of the read) are never smaller than
            this value
        max_readlet_size: size of every noncapping readlet
        readlet_interval: number of bases separating successive readlets
            along the read
        capping_multiplier: successive capping readlets on a given end of a
            read are increased in size exponentially with base
            capping_multiplier
        drop_deletions: True iff deletions should be dropped from coverage
            vector
        gzip_level: compression level to use for temporary files
        scratch: scratch directory for storing temporary files or None if
            securely created temporary directory
        index_count: number of transcriptome Bowtie 2 indexes to which to
            assign unmapped reads for later realignment
        output_bam_by_chr: True iff final output BAMs will be by chromosome
        tie_margin: allowed score difference per 100 bases among ties in
            max score. For example, 150 and 144 are tied alignment scores
            for a 100-bp read when --tie-margin is 6.
        no_realign: True iff job flow does not need more than readlets: this
            usually means only a transcript index is being constructed
        no_polyA: kill noncapping readlets that are all As and write as
            unmapped all reads with polyA prefixes whose suffixes are
            < min_exon_size

        No return value.
    """
    global _input_line_count
    reference_index = bowtie_index.BowtieIndexReference(bowtie_index_base)
    manifest_object = manifest.LabelsAndIndices(manifest_file)
    alignment_printer = AlignmentPrinter(
            manifest_object,
            reference_index,
            bin_size=bin_size,
            output_stream=output_stream,
            exon_ivals=exon_intervals,
            exon_diffs=exon_differentials,
            drop_deletions=drop_deletions,
            output_bam_by_chr=output_bam_by_chr,
            tie_margin=tie_margin
        )
    # Get task partition to pass to align_reads_delegate.py
    try:
        task_partition = os.environ['mapred_task_partition']
    except KeyError:
        # Hadoop 2.x?
        try:
            task_partition = os.environ['mapreduce_task_partition']
        except KeyError:
            # A unit test is probably being run
            task_partition = '0'
    temp_dir = make_temp_dir(scratch)
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir])
    align_file = os.path.join(temp_dir, 'first_pass_reads.temp.gz')
    other_reads_file = os.path.join(temp_dir, 'other_reads.temp.gz')
    second_pass_file = os.path.join(temp_dir, 'second_pass_reads.temp.gz')
    k_value, _, _ = bowtie.parsed_bowtie_args(bowtie2_args)
    nothing_doing = True
    # Required length of prefix after poly(A) is trimmed
    remaining_seq_size = max(min_exon_size - 1, 1)
    with xopen(True, align_file, 'w', gzip_level) as align_stream, \
        xopen(True, other_reads_file, 'w', gzip_level) as other_stream:
        for seq_number, ((seq,), xpartition) in enumerate(
                                                    xstream(sys.stdin, 1)
                                                ):
            seq_length = len(seq)
            if no_polyA and (
                    all(seq[i] == 'A'
                        for i in xrange(seq_length - remaining_seq_size))
                    or all(seq[i] == 'T'
                        for i in xrange(remaining_seq_size, seq_length))
                    or all(seq[i] == 'A'
                        for i in xrange(remaining_seq_size, seq_length))
                    or all(seq[i] == 'T'
                        for i in xrange(seq_length - remaining_seq_size))
                ):
                if not no_realign:
                    '''If a sequence is too short without its poly(A)
                    tail, make all reads with that sequence unmapped.
                    Technically, this also kills poly(A)s at 5' ends, but
                    we probably couldn't align those sequences anyway.'''
                    reversed_complement_seq = seq[::-1].translate(
                            _reversed_complement_translation_table
                        )
                    for is_reversed, name, qual in xpartition:
                        if is_reversed == '0':
                            alignment_printer.print_unmapped_read(
                                    name, seq, qual
                                )
                        else:
                            alignment_printer.print_unmapped_read(
                                    name, reversed_complement_seq, qual[::-1]
                                )
                continue
            nothing_doing = False
            '''Select highest-quality read with alphabetically last qname
            for first-pass alignment.'''
            best_name, best_mean_qual, best_qual_index, i = None, None, 0, 0
            others_to_print = dlist()
            for is_reversed, name, qual in xpartition:
                _input_line_count += 1
                others_to_print.append(
                        '\t'.join([
                            str(seq_number), is_reversed, name, qual
                        ])
                    )
                mean_qual = (
                        float(sum([ord(score) for score in qual])) / len(qual)
                    )
                if (mean_qual > best_mean_qual
                    or mean_qual == best_mean_qual and name > best_name):
                    best_qual_index = i
                    best_mean_qual = mean_qual
                    best_name = name
                    to_align = '\t'.join([
                            '%s\x1d%s' % (is_reversed, name), seq, qual
                        ])
                i += 1
            assert i >= 1
            if i == 1:
                print >>other_stream, str(seq_number)
            else:
                for j, other_to_print in enumerate(others_to_print):
                    if j != best_qual_index:
                        print >>other_stream, other_to_print
            print >>align_stream, to_align
    # Print dummy line
    print 'dummy\t-\tdummy'
    sys.stdout.flush() # this is REALLY important b/c called script will
                       # write to stdout
    if nothing_doing:
        # No input
        sys.exit(0)
    input_command = 'gzip -cd %s' % align_file
    bowtie_command = ' '.join([bowtie2_exe,
        bowtie2_args if bowtie2_args is not None else '',
        ' --sam-no-qname-trunc --local -t --no-hd --mm -x',
        bowtie2_index_base, '--12 -'])
    delegate_command = ''.join(
                [sys.executable, ' ', os.path.realpath(__file__)[:-3],
                    ('_delegate.py --task-partition {task_partition} '
                     '--other-reads {other_reads} --second-pass-reads '
                     '{second_pass_reads} --min-readlet-size '
                     '{min_readlet_size} {drop_deletions} '
                     '--max-readlet-size {max_readlet_size} '
                     '--readlet-interval {readlet_interval} '
                     '--capping-multiplier {capping_multiplier:1.12f} '
                     '{verbose} --report-multiplier {report_multiplier:1.12f} '
                     '--k-value {k_value} '
                     '--bowtie-idx {bowtie_index_base} '
                     '--partition-length {bin_size} '
                     '--manifest {manifest_file} '
                     '{exon_differentials} {exon_intervals} '
                     '--gzip-level {gzip_level} '
                     '--search-filter {search_filter} '
                     '--index-count {index_count} '
                     '--tie-margin {tie_margin} '
                     '{no_realign} '
                     '{no_polyA} '
                     '{output_bam_by_chr}').format(
                            task_partition=task_partition,
                            other_reads=other_reads_file,
                            second_pass_reads=second_pass_file,
                            min_readlet_size=min_readlet_size,
                            drop_deletions=('--drop-deletions'
                                                if drop_deletions else ''),
                            max_readlet_size=max_readlet_size,
                            readlet_interval=readlet_interval,
                            capping_multiplier=capping_multiplier,
                            verbose=('--verbose' if verbose else ''),
                            report_multiplier=report_multiplier,
                            k_value=k_value,
                            bowtie_index_base=bowtie_index_base,
                            bin_size=bin_size,
                            manifest_file=manifest_file,
                            exon_differentials=('--exon-differentials'
                                                if exon_differentials
                                                else ''),
                            exon_intervals=('--exon-intervals'
                                                if exon_intervals
                                                else ''),
                            gzip_level=gzip_level,
                            search_filter=search_filter,
                            index_count=index_count,
                            tie_margin=tie_margin,
                            no_realign=('--no-realign' if no_realign else ''),
                            no_polyA=('--no-polyA' if no_polyA else ''),
                            output_bam_by_chr=('--output-bam-by-chr'
                                                if output_bam_by_chr
                                                else '')
                        )]
            )
    full_command = ' | '.join([input_command,
                                bowtie_command, delegate_command])
    print >>sys.stderr, \
        'Starting first-pass Bowtie 2 with command: ' + full_command
    bowtie_process = subprocess.Popen(
                    ' '.join(['set -exo pipefail;', full_command]),
                    bufsize=-1, stdout=sys.stdout, stderr=sys.stderr,
                    shell=True, executable='/bin/bash')
    return_code = bowtie_process.wait()
    if return_code:
        raise RuntimeError('Error occurred while reading first-pass Bowtie 2 '
                           'output; exitlevel was %d.' % return_code)
    os.remove(align_file)
    os.remove(other_reads_file)
    if not no_realign:
        input_command = 'gzip -cd %s' % second_pass_file
        bowtie_command = ' '.join([bowtie2_exe,
            bowtie2_args if bowtie2_args is not None else '',
            ' --sam-no-qname-trunc --local -t --no-hd --mm -x',
            bowtie2_index_base, '--12 -'])
        delegate_command = ''.join(
                    [sys.executable, ' ', os.path.realpath(__file__)[:-3],
                        ('_delegate.py --task-partition {task_partition} '
                         '--min-readlet-size {min_readlet_size} '
                         '{drop_deletions} '
                         '--max-readlet-size {max_readlet_size} '
                         '--readlet-interval {readlet_interval} '
                         '--capping-multiplier {capping_multiplier:012f} '
                         '{verbose} '
                         '--report-multiplier {report_multiplier:012f} '
                         '--k-value {k_value} '
                         '--bowtie-idx {bowtie_index_base} '
                         '--partition-length {bin_size} '
                         '--manifest {manifest_file} '
                         '{exon_differentials} {exon_intervals} '
                         '--gzip-level {gzip_level} '
                         '--search-filter {search_filter} '
                         '--index-count {index_count} '
                         '--tie-margin {tie_margin} '
                         '{output_bam_by_chr}').format(
                                task_partition=task_partition,
                                min_readlet_size=min_readlet_size,
                                drop_deletions=('--drop-deletions'
                                                    if drop_deletions
                                                    else ''),
                                readlet_interval=readlet_interval,
                                max_readlet_size=max_readlet_size,
                                capping_multiplier=capping_multiplier,
                                verbose=('--verbose' if verbose else ''),
                                report_multiplier=report_multiplier,
                                k_value=k_value,
                                bowtie_index_base=bowtie_index_base,
                                bin_size=bin_size,
                                manifest_file=manifest_file,
                                exon_differentials=('--exon-differentials'
                                                    if exon_differentials
                                                    else ''),
                                exon_intervals=('--exon-intervals'
                                                    if exon_intervals
                                                    else ''),
                                gzip_level=gzip_level,
                                search_filter=search_filter,
                                index_count=index_count,
                                tie_margin=tie_margin,
                                output_bam_by_chr=('--output-bam-by-chr'
                                                    if output_bam_by_chr
                                                    else '')
                            )]
                )
        full_command = ' | '.join([input_command,
                                    bowtie_command, delegate_command])
        print >>sys.stderr, \
            'Starting second-pass Bowtie 2 with command: ' + full_command
        bowtie_process = subprocess.Popen(' '.join(
                        ['set -exo pipefail;', full_command]
                    ), bufsize=-1, stdout=sys.stdout, stderr=sys.stderr,
                    shell=True, executable='/bin/bash')
        return_code = bowtie_process.wait()
        if return_code:
            raise RuntimeError('Error occurred while reading second-pass '
                               'Bowtie 2 output; exitlevel was %d.'
                                % return_code)
    sys.stdout.flush()
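# Illustration only: the representative-read selection go() performs above.
# For each unique sequence, the record with the highest mean quality wins,
# with ties broken by the alphabetically last name. Records are made up.
records = [('r1', 'IIII#'), ('r2', 'IIIII'), ('r3', 'IIIII')]
best_name, best_mean_qual = None, None
for name, qual in records:
    mean_qual = float(sum([ord(score) for score in qual])) / len(qual)
    if (mean_qual > best_mean_qual
        or mean_qual == best_mean_qual and name > best_name):
        best_name, best_mean_qual = name, mean_qual
assert best_name == 'r3'  # ties with r2 but has the later name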
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])
    input_line_count = 0
    for (line_type,), xpartition in xstream(sys.stdin, 1):
        type_string = ('insertions' if line_type == '0'
                        else ('deletions' if line_type == '1'
                        else ('junctions' if line_type == '2'
                        else 'normalization')))
        output_filename = ((args.tsv_basename + '.'
                                if args.tsv_basename != '' else '')
                            + type_string + '.tsv.gz')
        if output_url.is_local:
            output_path = os.path.join(args.out, output_filename)
        else:
            output_path = os.path.join(temp_dir_path, output_filename)
        with xopen(True, output_path, 'w', args.gzip_level) as output_stream:
            if line_type != '3':
                '''Print all labels in the order in which they appear in the
                manifest file.'''
                sample_count = len(manifest_object.index_to_label)
                for i in xrange(sample_count):
                    output_stream.write(
                            '\t' + manifest_object.index_to_label[str(i)]
                        )
                output_stream.write('\n')
            for coverage_line in xpartition:
                input_line_count += 1
                (rname, pos, end_pos, strand_or_seq) = coverage_line[:4]
                '''Handle missing zeros at end of line here; in previous
                step, the total number of samples was unknown, so this was
                not done.'''
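# Illustration only: a sketch of the "missing zeros" fix-up described in
# the comment above. Coverage lines carry one value per sample, but
# trailing zero counts were dropped upstream, so lines are padded out to
# the now-known sample count. The function name and field layout are
# assumptions, not Rail-RNA's actual code.
def pad_coverages(coverages, sample_count):
    return coverages + ['0'] * (sample_count - len(coverages))

assert pad_coverages(['3', '1'], 4) == ['3', '1', '0', '0']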
def go(input_stream=sys.stdin, output_stream=sys.stdout,
        bowtie2_exe='bowtie2', bowtie_index_base='genome',
        bowtie2_index_base='genome2', manifest_file='manifest',
        bowtie2_args=None, bin_size=10000, verbose=False,
        exon_differentials=True, exon_intervals=False,
        report_multiplier=1.2, min_exon_size=8, search_filter=1,
        min_readlet_size=15, max_readlet_size=25, readlet_interval=12,
        capping_multiplier=1.5, drop_deletions=False, gzip_level=3,
        scratch=None, index_count=1, output_bam_by_chr=False,
        tie_margin=0, no_realign=False, no_polyA=False):
    """ Runs Rail-RNA-align_reads.

        A single pass of Bowtie 2 is run to find end-to-end alignments.
        Unmapped reads are saved for readletizing to determine introns in
        successive reduce steps as well as for realignment in a later map
        step.

        Input (read from stdin)
        ----------------------------
        Tab-delimited input tuple columns in a mix of any of the following
        two formats:
        Format 1 (single-end, 4-column):
        1. Nucleotide sequence or its reversed complement, whichever is
            first in alphabetical order
        2. 1 if sequence was reverse-complemented else 0
        3. Name
        4. Quality sequence or its reverse, whichever corresponds to
            field 1
        Format 2 (paired, 2 lines, 4 columns each; fields are the same as
        single-end):
        1. Nucleotide sequence for mate 1 or its reversed complement,
            whichever is first in alphabetical order
        2. 1 if sequence was reverse-complemented else 0
        3. Name for mate 1
        4. Quality sequence for mate 1 or its reverse, whichever
            corresponds to field 1
        (new line)
        1. Nucleotide sequence for mate 2 or its reversed complement,
            whichever is first in alphabetical order
        2. 1 if sequence was reverse-complemented else 0
        3. Name for mate 2
        4. Quality sequence for mate 2 or its reverse, whichever
            corresponds to field 1

        Input is partitioned and sorted by field 1, the read sequence.

        Hadoop output (written to stdout)
        ----------------------------
        A given RNAME sequence is partitioned into intervals ("bins") of
        some user-specified length (see partition.py).

        Exonic chunks (aka ECs; three formats, any or all of which may be
        emitted):

        Format 1 (exon_ival); tab-delimited output tuple columns:
        1. Reference name (RNAME in SAM format) + ';' + bin number
        2. Sample index
        3. EC start (inclusive) on forward strand
        4. EC end (exclusive) on forward strand

        Format 2 (exon_diff); tab-delimited output tuple columns:
        1. Reference name (RNAME in SAM format) + ';' + bin number
        2. max(EC start, bin start) (inclusive) on forward strand IFF diff
            is positive and EC end (exclusive) on forward strand IFF diff
            is negative
        3. Sample index
        4. '1' if alignment from which diff originates is "unique"
            according to --tie-margin criterion; else '0'
        5. +1 or -1 * count, the number of instances of a read sequence
            for which to print exonic chunks

        Note that only unique alignments are currently output as ivals
        and/or diffs.

        Format 3 (sam); tab-delimited output tuple columns:
        Standard SAM output except fields are in different order, and the
        first field corresponds to sample label. (Fields are reordered to
        facilitate partitioning by sample name/RNAME and sorting by POS.)
        Each line corresponds to a spliced alignment. The order of the
        fields is as follows.
        1. Sample index if outputting BAMs by sample OR sample-rname index
            if outputting BAMs by chr
        2. (Number string representing RNAME; see BowtieIndexReference
            class in bowtie_index for conversion information) OR '0' if
            outputting BAMs by chr
        3. POS
        4. QNAME
        5. FLAG
        6. MAPQ
        7. CIGAR
        8. RNEXT
        9. PNEXT
        10. TLEN
        11. SEQ
        12. QUAL
        ... + optional fields

        Insertions/deletions (indel_bed); tab-delimited output tuple
        columns:
        1. 'I' or 'D' for insertion or deletion line
        2. Number string representing RNAME
        3. Start position (last base before insertion or first base of
            deletion)
        4. End position (last base before insertion or last base of
            deletion (exclusive))
        5. Inserted sequence for insertions or deleted sequence for
            deletions
        6. Sample index
        ----Next fields are for introns only; they are '\x1c' for
            indels----
        7. '\x1c'
        8. '\x1c'
        --------------------------------------------------------------
        9. Number of instances of insertion or deletion in sample; this
            is always +1 * count before bed_pre combiner/reducer

        Reads whose primary alignments are not end-to-end:

        Tab-delimited output tuple columns (unmapped):
        1. Transcriptome Bowtie 2 index group number
        2. SEQ
        3. 1 if SEQ is reverse-complemented, else 0
        4. QNAME
        5. QUAL

        Tab-delimited output tuple columns (readletized):
        1. Readlet sequence or its reversed complement, whichever is
            first in alphabetical order
        2. read sequence ID + ('-' if readlet sequence is
            reverse-complemented; else '+') + '\x1e' + displacement of
            readlet's 5' end from read's 5' end + '\x1e' + displacement
            of readlet's 3' end from read's 3' end (+, for EXACTLY one
            readlet of a read sequence, '\x1e' + read sequence + '\x1e'
            + (an '\x1f'-separated list A of unique sample labels with
            read sequences that match the original read sequence) +
            '\x1e' + (an '\x1f'-separated list B of unique sample labels
            with read sequences that match the reversed complement of
            the original read sequence)) + '\x1e' + (an '\x1f'-separated
            list of the number of instances of the read sequence for
            each respective sample in list A) + '\x1e' + (an
            '\x1f'-separated list of the number of instances of the read
            sequence's reversed complement for each respective sample in
            list B). Here, a read sequence ID takes the form X:Y, where
            X is the "mapred_task_partition" environment variable -- a
            unique index for a task within a job -- and Y is the index
            of the read sequence relative to the beginning of the input
            stream.

        Tab-delimited tuple columns (postponed_sam):
        Standard 11+-column raw SAM output

        Single column (unique):
        1. A unique read sequence

        Two columns, exactly one line (dummy); ensures creation of intron
        index:
        1. character "-"
        2. the word "dummy"

        ALL OUTPUT COORDINATES ARE 1-INDEXED.

        input_stream: where to find input reads.
        output_stream: where to emit exonic chunks and introns.
        bowtie2_exe: filename of Bowtie 2 executable; include path if not
            in $PATH.
        bowtie_index_base: the basename of the Bowtie 1 index files
            associated with the reference.
        bowtie2_index_base: the basename of the Bowtie 2 index files
            associated with the reference.
        manifest_file: filename of manifest
        bowtie2_args: string containing precisely extra command-line
            arguments to pass to first-pass Bowtie 2.
        bin_size: genome is partitioned in units of bin_size for later
            load balancing.
        verbose: True iff more informative messages should be written to
            stderr.
        exon_differentials: True iff EC differentials are to be emitted.
        exon_intervals: True iff EC intervals are to be emitted.
        report_multiplier: if verbose is True, the line number of an
            alignment or read written to stderr increases exponentially
            with base report_multiplier.
        min_exon_size: minimum exon size searched for in intron_search.py
            later in pipeline; used to determine how large a soft clip on
            one side of a read is necessary to pass it on to intron
            search pipeline
        search_filter: how large a soft clip on one side of a read is
            necessary to pass it on to intron search pipeline
        min_readlet_size: "capping" readlets (that is, readlets that
            terminate at a given end of the read) are never smaller than
            this value
        max_readlet_size: size of every noncapping readlet
        readlet_interval: number of bases separating successive readlets
            along the read
        capping_multiplier: successive capping readlets on a given end of
            a read are increased in size exponentially with base
            capping_multiplier
        drop_deletions: True iff deletions should be dropped from
            coverage vector
        gzip_level: compression level to use for temporary files
        scratch: scratch directory for storing temporary files or None if
            securely created temporary directory
        index_count: number of transcriptome Bowtie 2 indexes to which to
            assign unmapped reads for later realignment
        output_bam_by_chr: True iff final output BAMs will be by
            chromosome
        tie_margin: allowed score difference per 100 bases among ties in
            max score. For example, 150 and 144 are tied alignment scores
            for a 100-bp read when --tie-margin is 6.
        no_realign: True iff job flow does not need more than readlets:
            this usually means only a transcript index is being
            constructed
        no_polyA: kill noncapping readlets that are all As and write as
            unmapped all reads with poly(A) prefixes whose suffixes are
            < min_exon_size

        No return value.
    """
    global _input_line_count
    # Required length of prefix after poly(A) is trimmed
    remaining_seq_size = max(min_exon_size - 1, 1)
    polyA_set = frozenset(
            ['A' * i for i in xrange(1, remaining_seq_size + 1)]
            + ['T' * i for i in xrange(1, remaining_seq_size + 1)]
            + [''])
    reference_index = bowtie_index.BowtieIndexReference(bowtie_index_base)
    manifest_object = manifest.LabelsAndIndices(manifest_file)
    alignment_printer = AlignmentPrinter(
            manifest_object,
            reference_index,
            bin_size=bin_size,
            output_stream=output_stream,
            exon_ivals=exon_intervals,
            exon_diffs=exon_differentials,
            drop_deletions=drop_deletions,
            output_bam_by_chr=output_bam_by_chr,
            tie_margin=tie_margin)
    # Get task partition to pass to align_reads_delegate.py
    try:
        task_partition = os.environ['mapred_task_partition']
    except KeyError:
        # Hadoop 2.x?
        try:
            task_partition = os.environ['mapreduce_task_partition']
        except KeyError:
            # A unit test is probably being run
            task_partition = '0'
    temp_dir = make_temp_dir(scratch)
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir])
    align_file = os.path.join(temp_dir, 'first_pass_reads.temp.gz')
    other_reads_file = os.path.join(temp_dir, 'other_reads.temp.gz')
    second_pass_file = os.path.join(temp_dir, 'second_pass_reads.temp.gz')
    k_value, _, _ = bowtie.parsed_bowtie_args(bowtie2_args)
    nothing_doing = True
    with xopen(True, align_file, 'w', gzip_level) as align_stream, \
        xopen(True, other_reads_file, 'w', gzip_level) as other_stream:
        for seq_number, ((seq,), xpartition) in enumerate(
                                                    xstream(sys.stdin, 1)):
            if no_polyA and (seq[:-remaining_seq_size] in polyA_set
                                or seq[remaining_seq_size:] in polyA_set):
                if not no_realign:
                    '''If a sequence is too short without its poly(A)
                    tail, make all reads with that sequence unmapped.
                    Technically, this also kills poly(A)s at 5' ends, but
                    we probably couldn't align those sequences anyway.'''
                    reversed_complement_seq = seq[::-1].translate(
                            _reversed_complement_translation_table)
                    for is_reversed, name, qual in xpartition:
                        if is_reversed == '0':
                            alignment_printer.print_unmapped_read(
                                    name, seq, qual)
                        else:
                            alignment_printer.print_unmapped_read(
                                    name, reversed_complement_seq,
                                    qual[::-1])
                continue
            nothing_doing = False
            '''Select highest-quality read with alphabetically last qname
            for first-pass alignment.'''
            best_name, best_mean_qual, best_qual_index, i = (None, None,
                                                                0, 0)
            others_to_print = dlist()
            for is_reversed, name, qual in xpartition:
                _input_line_count += 1
                others_to_print.append('\t'.join(
                        [str(seq_number), is_reversed, name, qual]))
                mean_qual = (float(sum([ord(score) for score in qual]))
                                / len(qual))
                if (mean_qual > best_mean_qual
                        or (mean_qual == best_mean_qual
                            and name > best_name)):
                    best_qual_index = i
                    best_mean_qual = mean_qual
                    best_name = name
                    to_align = '\t'.join(
                            ['%s\x1d%s' % (is_reversed, name), seq, qual])
                i += 1
            assert i >= 1
            if i == 1:
                print >>other_stream, str(seq_number)
            else:
                for j, other_to_print in enumerate(others_to_print):
                    if j != best_qual_index:
                        print >>other_stream, other_to_print
            print >>align_stream, to_align
    # Print dummy line
    print 'dummy\t-\tdummy'
    # This flush is REALLY important: the called script also writes to
    # stdout
    sys.stdout.flush()
    if nothing_doing:
        # No input
        sys.exit(0)
    input_command = 'gzip -cd %s' % align_file
    bowtie_command = ' '.join([
            bowtie2_exe,
            bowtie2_args if bowtie2_args is not None else '',
            ' --sam-no-qname-trunc --local -t --no-hd --mm -x',
            bowtie2_index_base, '--12 -'])
    delegate_command = ''.join([
            sys.executable, ' ', os.path.realpath(__file__)[:-3],
            ('_delegate.py --task-partition {task_partition} '
             '--other-reads {other_reads} --second-pass-reads '
             '{second_pass_reads} --min-readlet-size '
             '{min_readlet_size} {drop_deletions} '
             '--max-readlet-size {max_readlet_size} '
             '--readlet-interval {readlet_interval} '
             '--capping-multiplier {capping_multiplier:1.12f} '
             '{verbose} --report-multiplier {report_multiplier:1.12f} '
             '--k-value {k_value} '
             '--bowtie-idx {bowtie_index_base} '
             '--partition-length {bin_size} '
             '--manifest {manifest_file} '
             '{exon_differentials} {exon_intervals} '
             '--gzip-level {gzip_level} '
             '--search-filter {search_filter} '
             '--index-count {index_count} '
             '--tie-margin {tie_margin} '
             '{no_realign} '
             '{no_polyA} '
             '{output_bam_by_chr}').format(
                    task_partition=task_partition,
                    other_reads=other_reads_file,
                    second_pass_reads=second_pass_file,
                    min_readlet_size=min_readlet_size,
                    drop_deletions=('--drop-deletions' if drop_deletions
                                        else ''),
                    max_readlet_size=max_readlet_size,
                    readlet_interval=readlet_interval,
                    capping_multiplier=capping_multiplier,
                    verbose=('--verbose' if verbose else ''),
                    report_multiplier=report_multiplier,
                    k_value=k_value,
                    bowtie_index_base=bowtie_index_base,
                    bin_size=bin_size,
                    manifest_file=manifest_file,
                    exon_differentials=('--exon-differentials'
                                            if exon_differentials else ''),
                    exon_intervals=('--exon-intervals'
                                        if exon_intervals else ''),
                    gzip_level=gzip_level,
                    search_filter=search_filter,
                    index_count=index_count,
                    tie_margin=tie_margin,
                    no_realign=('--no-realign' if no_realign else ''),
                    no_polyA=('--no-polyA' if no_polyA else ''),
                    output_bam_by_chr=('--output-bam-by-chr'
                                            if output_bam_by_chr else ''))
        ])
    full_command = ' | '.join(
            [input_command, bowtie_command, delegate_command])
    print >>sys.stderr, \
        'Starting first-pass Bowtie 2 with command: ' + full_command
    bowtie_process = subprocess.Popen(
            ' '.join(['set -exo pipefail;', full_command]),
            bufsize=-1, stdout=sys.stdout, stderr=sys.stderr,
            shell=True, executable='/bin/bash')
    return_code = bowtie_process.wait()
    if return_code:
        raise RuntimeError('Error occurred while reading first-pass '
                           'Bowtie 2 output; exitlevel was %d.'
                           % return_code)
    os.remove(align_file)
    os.remove(other_reads_file)
    if not no_realign:
        input_command = 'gzip -cd %s' % second_pass_file
        bowtie_command = ' '.join([
                bowtie2_exe,
                bowtie2_args if bowtie2_args is not None else '',
                ' --sam-no-qname-trunc --local -t --no-hd --mm -x',
                bowtie2_index_base, '--12 -'])
        delegate_command = ''.join([
                sys.executable, ' ', os.path.realpath(__file__)[:-3],
                ('_delegate.py --task-partition {task_partition} '
                 '--min-readlet-size {min_readlet_size} '
                 '{drop_deletions} '
                 '--max-readlet-size {max_readlet_size} '
                 '--readlet-interval {readlet_interval} '
                 '--capping-multiplier {capping_multiplier:1.12f} '
                 '{verbose} '
                 '--report-multiplier {report_multiplier:1.12f} '
                 '--k-value {k_value} '
                 '--bowtie-idx {bowtie_index_base} '
                 '--partition-length {bin_size} '
                 '--manifest {manifest_file} '
                 '{exon_differentials} {exon_intervals} '
                 '--gzip-level {gzip_level} '
                 '--search-filter {search_filter} '
                 '--index-count {index_count} '
                 '--tie-margin {tie_margin} '
                 '{output_bam_by_chr}').format(
                        task_partition=task_partition,
                        min_readlet_size=min_readlet_size,
                        drop_deletions=('--drop-deletions'
                                            if drop_deletions else ''),
                        readlet_interval=readlet_interval,
                        max_readlet_size=max_readlet_size,
                        capping_multiplier=capping_multiplier,
                        verbose=('--verbose' if verbose else ''),
                        report_multiplier=report_multiplier,
                        k_value=k_value,
                        bowtie_index_base=bowtie_index_base,
                        bin_size=bin_size,
                        manifest_file=manifest_file,
                        exon_differentials=('--exon-differentials'
                                                if exon_differentials
                                                else ''),
                        exon_intervals=('--exon-intervals'
                                            if exon_intervals else ''),
                        gzip_level=gzip_level,
                        search_filter=search_filter,
                        index_count=index_count,
                        tie_margin=tie_margin,
                        output_bam_by_chr=('--output-bam-by-chr'
                                                if output_bam_by_chr
                                                else ''))
            ])
        full_command = ' | '.join(
                [input_command, bowtie_command, delegate_command])
        print >>sys.stderr, \
            'Starting second-pass Bowtie 2 with command: ' + full_command
        bowtie_process = subprocess.Popen(
                ' '.join(['set -exo pipefail;', full_command]),
                bufsize=-1, stdout=sys.stdout, stderr=sys.stderr,
                shell=True, executable='/bin/bash')
        return_code = bowtie_process.wait()
        if return_code:
            raise RuntimeError('Error occurred while reading second-pass '
                               'Bowtie 2 output; exitlevel was %d.'
                               % return_code)
    sys.stdout.flush()
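# Illustrative sketch, not from the original source: the pattern used in
# both passes above is a gzip decompressor piped into an aligner piped into
# a delegate script, run under bash with pipefail so a failure in any stage
# surfaces in the exit code. The stage commands below are placeholders.
import subprocess
import sys

def run_pipeline(stages):
    """ Joins shell commands with pipes; raises if any stage fails. """
    full_command = ' | '.join(stages)
    process = subprocess.Popen(
            ' '.join(['set -exo pipefail;', full_command]),
            bufsize=-1, stdout=sys.stdout, stderr=sys.stderr,
            shell=True, executable='/bin/bash')
    return_code = process.wait()
    if return_code:
        raise RuntimeError('Pipeline failed; exitlevel was %d.'
                            % return_code)

# Example (placeholder commands):
# run_pipeline(['gzip -cd reads.temp.gz', 'bowtie2 ... --12 -',
#               'python delegate.py'])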
    output_filename = ((args.tsv_basename + '.'
                            if args.tsv_basename != '' else '')
                            + 'counts.tsv.gz')
    if output_url.is_local:
        output_path = os.path.join(args.out, output_filename)
    else:
        output_path = os.path.join(temp_dir_path, output_filename)
    input_line_count = 0
    # Get RNAMEs in order of descending length
    sorted_rnames = [
            reference_index.string_to_rname['%012d' % i]
            for i in xrange(len(reference_index.string_to_rname))
        ]
    sample_indexes_seen = set()
    with xopen(True, output_path, 'w', args.gzip_level) as output_stream:
        print >>output_stream, '\t'.join(
                [''] + sorted_rnames
                + ['total mapped reads', 'total reads'])
        for (_, sample_index), xpartition in xstream(sys.stdin, 2):
            sample_label = manifest_object.index_to_label[sample_index]
            total_counts, unique_counts = (defaultdict(int),
                                            defaultdict(int))
            for rname_index, total_count, unique_count in xpartition:
                rname = reference_index.string_to_rname[rname_index]
                total_counts[rname] = int(total_count)
                unique_counts[rname] = int(unique_count)
            total_reads = sum(total_counts.values())
            total_mapped_reads = total_reads - total_counts['*']
            total_uniques = sum(unique_counts.values())
            total_mapped_uniques = total_uniques - unique_counts['*']
            print >>output_stream, '\t'.join(
                    [sample_label]
                    + ['%d,%d' % (total_counts[rname],
                                    unique_counts[rname])
                        for rname in sorted_rnames]
                    + ['%d,%d' % (total_mapped_reads,
                                    total_mapped_uniques),
                       '%d,%d' % (total_reads, total_uniques)])
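# Illustrative sketch (an assumption, not the original consumer): reading
# back the counts.tsv.gz written above. Each cell is "<all>,<unique>", and
# downstream code in this document takes the "total mapped reads" column
# (tokens[-2]) and splits it on the comma to recover both counts.
import gzip

def read_mapped_counts(counts_path):
    """ Returns {sample_label: (mapped, unique_mapped)} from counts.tsv.gz. """
    counts = {}
    with gzip.open(counts_path) as stream:
        stream.readline()  # skip header row of RNAMEs and totals
        for line in stream:
            tokens = line.strip().split('\t')
            mapped, unique = tokens[-2].split(',')
            counts[tokens[0]] = (int(mapped), int(unique))
    return counts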
        read_id, _, read_rest = read.partition('\x1e')
        if len(read_rest.split('\x1e')) > 2:
            print >>output_stream, \
                '%s\t%s\t\x1c\t\x1c\t\x1c' % (read_id[:-1], read_rest)
            output_line_count += 1
        read = qname_stream.readline().strip()
    output_stream.flush()
    print >>sys.stderr, ('align_readlets_delegate.py reports %d output '
                         'lines.' % output_line_count)

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(
            description=__doc__,
            formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
            '--verbose', action='store_const', const=True, default=False,
            help='Print out extra debugging statements')
    parser.add_argument(
            '--qnames-file', type=str, required=True,
            help=('Where to find extended QNAMEs storing read sequence '
                  'IDs to which readlets belong and other pertinent '
                  'information'))
    parser.add_argument(
            '--report-multiplier', type=float, required=False,
            default=1.2,
            help=('When --verbose is also invoked, the only lines of '
                  'lengthy intermediate output written to stderr have '
                  'line numbers that increase exponentially with this '
                  'base'))
    args = parser.parse_args()

    with xopen(None, args.qnames_file) as qname_stream:
        go(qname_stream, verbose=args.verbose,
            report_multiplier=args.report_multiplier)
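# Illustrative sketch, an assumption rather than the original
# implementation: how a delegate can recover the group of extended QNAMEs
# for each readlet sequence from the qnames file written by
# align_readlets.py below, where groups are separated by lines holding a
# single '+'.
def qname_groups(qname_stream):
    """ Yields lists of extended QNAMEs, one list per readlet sequence. """
    group = []
    for line in qname_stream:
        line = line.strip()
        if line == '+':
            yield group
            group = []
        else:
            group.append(line)
    if group:
        yield group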
def go(input_stream=sys.stdin, output_stream=sys.stdout,
        bowtie_exe='bowtie', bowtie_index_base='genome', bowtie_args='',
        gzip_level=3, verbose=False, report_multiplier=1.2, scratch=None):
    """ Runs Rail-RNA-align_readlets.

        Aligns input readlet sequences and writes a single output line
        per readlet belonging to a distinct read sequence.

        Input (read from stdin)
        ----------------------------
        Tab-delimited input tuple columns:
        1. Readlet sequence or its reversed complement, whichever is
            first in alphabetical order
        2. read sequence ID + ('-' if readlet sequence is
            reverse-complemented; else '+') + '\x1e' + displacement of
            readlet's 5' end from read's 5' end + '\x1e' + displacement
            of readlet's 3' end from read's 3' end (+, for EXACTLY one
            readlet of a read sequence, '\x1e' + read sequence + '\x1e'
            + (an '\x1f'-separated list A of unique sample labels with
            read sequences that match the original read sequence) +
            '\x1e' + (an '\x1f'-separated list B of unique sample labels
            with read sequences that match the reversed complement of
            the original read sequence)) + '\x1e' + (an '\x1f'-separated
            list of the number of instances of the read sequence for
            each respective sample in list A) + '\x1e' + (an
            '\x1f'-separated list of the number of instances of the read
            sequence's reversed complement for each respective sample in
            list B). Here, a read sequence ID takes the form X:Y, where
            X is the "mapred_task_partition" environment variable -- a
            unique index for a task within a job -- and Y is the index
            of the read sequence relative to the beginning of the input
            stream.

        Input is partitioned by field 1, the readlet sequence or its
        reversed complement.

        Hadoop output (written to stdout)
        ----------------------------
        Tab-delimited output tuple columns, where each line corresponds
        to a readlet from a distinct read rather than a unique readlet
        sequence:
        1. Read sequence ID
        2. Displacement of readlet's 5' end from read's 5' end + '\x1e'
            + displacement of readlet's 3' end from read's 3' end (+,
            for EXACTLY one readlet of a read sequence, '\x1e' + read
            sequence + '\x1e' + (an '\x1f'-separated list A of unique
            sample labels with read sequences that match the original
            read sequence) + '\x1e' + (an '\x1f'-separated list B of
            unique sample labels with read sequences that match the
            reversed complement of the original read sequence)) +
            '\x1e' + (an '\x1f'-separated list of the number of
            instances of the read sequence for each respective sample
            in list A) + '\x1e' + (an '\x1f'-separated list of the
            number of instances of the read sequence's reversed
            complement for each respective sample in list B)
        3. '\x1f'-separated list of alignment RNAMEs or '\x1c' if no
            alignments
        4. '\x1f'-separated list of alignment FLAGs or '\x1c' if no
            alignments
        5. '\x1f'-separated list of alignment POSes or '\x1c' if no
            alignments

        ALL OUTPUT COORDINATES ARE 1-INDEXED.

        input_stream: where to find input readlets.
        output_stream: where to emit readlet alignments.
        bowtie_exe: filename of Bowtie executable; include path if not
            in $PATH.
        bowtie_index_base: the basename of the Bowtie index files
            associated with the reference.
        bowtie_args: string containing precisely extra command-line
            arguments to pass to first-pass Bowtie, e.g., "--tryhard
            --best"; or None.
        gzip_level: level of gzip compression to use for qname file
        verbose: True iff more informative messages should be written to
            stderr.
        report_multiplier: if verbose is True, the line number of an
            alignment written to stderr increases exponentially with base
            report_multiplier.
        scratch: scratch directory for storing temporary files or None if
            securely created temporary directory

        No return value.
    """
    global _input_line_count
    # For storing long qnames
    temp_dir = make_temp_dir(scratch)
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir])
    qnames_file = os.path.join(temp_dir, 'qnames.temp.gz')
    readlet_file = os.path.join(temp_dir, 'readlets.temp.gz')
    with xopen(True, qnames_file, 'w', gzip_level) as qname_stream:
        with xopen(True, readlet_file, 'w', gzip_level) as readlet_stream:
            for (seq_count, ((seq,), xpartition)) \
                    in enumerate(xstream(input_stream, 1)):
                print >>readlet_stream, \
                    '\t'.join([str(seq_count), seq, 'I' * len(seq)])
                print >>qname_stream, next(iter(xpartition))[0]
                for (qname,) in xpartition:
                    _input_line_count += 1
                    print >>qname_stream, qname
                # Separate qnames with single + character
                print >>qname_stream, '+'
    input_command = 'gzip -cd %s' % readlet_file
    bowtie_command = ' '.join([
            bowtie_exe, bowtie_args, '-S -t --sam-nohead --mm',
            bowtie_index_base, '--12 -'])
    delegate_command = ''.join([
            sys.executable, ' ', os.path.realpath(__file__)[:-3],
            '_delegate.py --report-multiplier %08f --qnames-file %s %s'
            % (report_multiplier, qnames_file,
                '--verbose' if verbose else '')])
    full_command = ' | '.join(
            [input_command, bowtie_command, delegate_command])
    print >>sys.stderr, 'Starting Bowtie with command: ' + full_command
    bowtie_process = subprocess.Popen(
            ' '.join(['set -exo pipefail;', full_command]),
            bufsize=-1, stdout=sys.stdout, stderr=sys.stderr,
            shell=True, executable='/bin/bash')
    return_code = bowtie_process.wait()
    if return_code:
        raise RuntimeError('Error occurred while reading Bowtie output; '
                           'exitlevel was %d.' % return_code)
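# Illustrative sketch, not from the original source: the readlet file
# above feeds Bowtie's --12 option, which accepts one unpaired read per
# line as <name> TAB <sequence> TAB <qualities>. Since readlets carry no
# quality strings of their own here, a run of 'I' (high phred+33) stands
# in for QUAL, exactly as in the loop above.
def bowtie_12_record(read_id, seq):
    """ Formats one readlet as an unpaired Bowtie --12 input line. """
    return '\t'.join([str(read_id), seq, 'I' * len(seq)])

# Example: bowtie_12_record(0, 'ACGT') == '0\tACGT\tIIII'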
            continue
    except IndexError:
        continue
    token_count = len(tokens)
    assert token_count in [3, 5], (
            'Line {} of input has {} fields, but 3 or 5 are '
            'expected.').format(input_line_count + 1, token_count)
    file_to_count = tokens[0]
    if not ((token_count == 3 and Url(tokens[0]).is_local)
                or (token_count == 5 and Url(tokens[0]).is_local
                        and Url(tokens[2]).is_local)):
        sys.stdout.write(line)
        output_line_count += 1
        continue
    with xopen(None, file_to_count) as input_stream:
        first_char = input_stream.readline()[0]
        if first_char in fastq_cues:
            # 4 lines per record
            line_divider = 4
        elif first_char in fasta_cues:
            line_divider = 2
        else:
            raise RuntimeError(
                    'File "{}" is neither a FASTA nor a FASTQ '
                    'file.'.format(file_to_count))
    with xopen(None, file_to_count) as input_stream:
        phred_format, line_count = inferred_phred_format(input_stream)
    lines_and_bytes = str((int(line_count) + 1) / line_divider)
    print '\t'.join([
            '#!splitload',
            lines_and_bytes,
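# Illustrative sketch (an assumption; the real definitions of fastq_cues
# and fasta_cues are not shown in this excerpt): sniffing the format from
# the first character, as above. FASTQ records begin with '@'; FASTA name
# lines begin with '>' (or ';' in old-style FASTA), so a FASTQ record
# spans 4 lines and an unwrapped FASTA record spans 2.
fastq_cues = set('@')
fasta_cues = set('>;')

def records_per_file(first_char, line_count):
    """ Returns the approximate record count given the first character. """
    if first_char in fastq_cues:
        return (line_count + 1) / 4  # 4 lines per FASTQ record
    elif first_char in fasta_cues:
        return (line_count + 1) / 2  # 2 lines per unwrapped FASTA record
    raise RuntimeError('Neither FASTA nor FASTQ.')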