示例#1
0
    help='Basename for index to be written')
# Optional flag: emit periodic keep-alive status lines on stderr
parser.add_argument(
    '--keep-alive', action='store_const', const=True, default=False,
    help='Prints reporter:status:alive messages to stderr to keep EMR '
         'task alive')

# Let each helper module register its own command-line options
filemover.add_args(parser)
bowtie.add_args(parser)
tempdel.add_args(parser)
args = parser.parse_args()

import time
start_time = time.time()

output_filename, output_stream, output_url = [None] * 3
# Fall back to the current working directory when no --out was given
output_url = Url(args.out) if args.out is not None \
    else Url(os.getcwd())
# Set up temporary destination
import tempfile
temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
# For deleting temporary directory, even on unexpected exit
register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])
# Create the subdirectory that will hold the index files. Only an
# "already exists" failure is tolerated; any other error (permissions,
# full disk, ...) is surfaced here rather than as a confusing failure later.
index_dir_path = os.path.join(temp_dir_path, 'index')
try:
    os.makedirs(index_dir_path)
except OSError:
    if not os.path.isdir(index_dir_path):
        raise
# Write to temporary directory, and later upload to URL
index_basename = os.path.join(temp_dir_path, 'index/' + args.basename)
fasta_file = os.path.join(temp_dir_path, 'temp.fa')
print >> sys.stderr, 'Opened %s for writing....' % fasta_file
示例#2
0
formed; reference_index.rname_lengths[RNAME] is the length of RNAME.'''
# Reference index object; its rname_lengths attribute is used below to map
# each reference name to its length
reference_index = bowtie_index.BowtieIndexReference(
    os.path.expandvars(args.bowtie_idx))
# For mapping sample indices back to original sample labels
manifest_object = manifest.LabelsAndIndices(os.path.expandvars(args.manifest))
# Create file with chromosome sizes for bedTobigwig
sizes_filename = os.path.join(temp_dir_path, 'chrom.sizes')
if args.verbose:
    print >> sys.stderr, 'Sizes file: %s .' % sizes_filename
# One "<rname> <length>" line per reference sequence
with open(sizes_filename, 'w') as sizes_stream:
    for rname in reference_index.rname_lengths:
        print >> sizes_stream, '%s %d' % (rname,
                                          reference_index.rname_lengths[rname])

input_line_count, output_line_count = 0, 0
output_url = Url(args.out)
if output_url.is_local:
    # Set up destination directory. Only an "already exists" failure is
    # tolerated; other errors (e.g. permissions) are raised immediately
    # instead of being silently swallowed.
    local_output_dir = output_url.to_url()
    try:
        os.makedirs(local_output_dir)
    except OSError:
        if not os.path.isdir(local_output_dir):
            raise
mover = filemover.FileMover(args=args)
# Template for the bedGraph track header; {name} and {description} are
# filled in with str.format()
track_line = ('track type=bedGraph name="{name}" '
              'description="{description}" visibility=full '
              'color=227,29,118 altColor=0,179,220 priority=400')
for (sample_index, ), xpartition in xstream(sys.stdin, 1):
    try:
        sample_label = manifest_object.index_to_label[sample_index]
    except KeyError:
        # It's a mean or median
示例#3
0
def go(input_stream=sys.stdin,
       output_stream=sys.stdout,
       bowtie2_exe='bowtie2',
       bowtie2_index_base='genome',
       bowtie2_args='',
       verbose=False,
       report_multiplier=1.2,
       stranded=False,
       fudge=5,
       score_min=60,
       gzip_level=3,
       mover=filemover.FileMover(),
       intermediate_dir='.',
       scratch=None):
    """ Runs Rail-RNA-cointron_enum 

        Alignment script for MapReduce pipelines that wraps Bowtie 2. Finds
        introns that cooccur on reads by local alignments to transcriptome
        elements from Bowtie 2.

        Input (read from stdin)
        ----------------------------
        Tab-delimited output tuple columns (readletize)
        1. SEQ or its reversed complement, whichever is first in alphabetical
            order
        2. Comma-separated list of sample labels if field 1 is the read
            sequence; '\x1c' if empty
        3. Comma-separated list of sample labels if field 1 is the reversed
            complement of the read sequence; '\x1c' if empty

        Hadoop output (written to stdout)
        ----------------------------
        Tab-delimited tuple columns:
        1. Reference name (RNAME in SAM format) + 
            '+' or '-' indicating which strand is the sense strand
        2. Comma-separated list of intron start positions in configuration
        3. Comma-separated list of intron end positions in configuration
        4. left_extend_size: by how many bases on the left side of an intron
            the reference should extend
        5. right_extend_size: by how many bases on the right side of an intron
            the reference should extend
        6. Read sequence

        input_stream: where to find input reads.
        output_stream: where to emit exonic chunks and introns. NOTE: this
            argument is currently not consulted; the Bowtie 2 / delegate
            pipeline is started with its stdout attached to sys.stdout.
        bowtie2_exe: filename of Bowtie 2 executable; include path if not in
            $PATH.
        bowtie2_index_base: the basename of the Bowtie index files associated
            with the reference. If it is an S3 URL, the index files are first
            downloaded to <intermediate_dir>/transcript_index.
        bowtie2_args: string containing precisely extra command-line arguments
            to pass to Bowtie 2, e.g., "--tryhard --best"; or None.
        verbose: True iff more informative messages should be written to
            stderr.
        report_multiplier: if verbose is True, the line number of an alignment
            written to stderr increases exponentially with base
            report_multiplier.
        stranded: True iff input reads are strand-specific; this affects
            whether an output partition has a terminal '+' or '-' indicating
            the sense strand. Further, if stranded is True, an alignment is
            returned only if its strand agrees with the intron's strand.
        fudge: by how many bases to extend left and right extend sizes
                to accommodate potential indels
        score_min: Bowtie2 CONSTANT minimum alignment score
        gzip_level: compression level to use for temporary files
        mover: FileMover object, for use in case Bowtie2 idx needs to be
            pulled from S3
        intermediate_dir: where intermediates are stored; for temporarily
            storing transcript index if it needs to be pulled from S3
        scratch: scratch directory for storing temporary files or None if 
            securely created temporary directory

        Raises RuntimeError if the shell pipeline exits with a nonzero code.

        No return value.
    """
    global _input_line_count
    bowtie2_index_base_url = Url(bowtie2_index_base)
    if bowtie2_index_base_url.is_s3:
        index_basename = os.path.basename(bowtie2_index_base)
        index_directory = os.path.join(intermediate_dir, 'transcript_index')
        # Flag files coordinate concurrent callers: whoever finds no _STARTED
        # flag performs the download; everyone else waits for _SUCCESS.
        started_flag = os.path.join(index_directory, '_STARTED')
        success_flag = os.path.join(index_directory, '_SUCCESS')
        if not os.path.exists(started_flag):
            # The flag file cannot be created unless its directory exists;
            # tolerate only the "already there" case.
            try:
                os.makedirs(index_directory)
            except OSError:
                if not os.path.isdir(index_directory):
                    raise
            # Download index
            with open(started_flag, 'w') as started_stream:
                print >> started_stream, 'STARTED'
            for extension in [
                    '.1.bt2', '.2.bt2', '.3.bt2', '.4.bt2', '.rev.1.bt2',
                    '.rev.2.bt2'
            ]:
                # Fetch each index file by its full name; the extension-less
                # basename alone does not name any of the index files.
                mover.get(Url(bowtie2_index_base + extension),
                          index_directory)
            with open(success_flag, 'w') as success_stream:
                print >> success_stream, 'SUCCESS'
        # Poll until the download is flagged as complete
        while not os.path.exists(success_flag):
            time.sleep(0.5)
        # From here on, use the local copy of the index
        bowtie2_index_base = os.path.join(index_directory, index_basename)
    temp_dir_path = make_temp_dir(scratch)
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])
    reads_file = os.path.join(temp_dir_path, 'reads.temp.gz')
    # Write every input line as a three-column record (name, sequence,
    # qualities) for Bowtie 2's --12 input mode: the sequence doubles as its
    # own name, and the quality string is all 'I's.
    with xopen(True, reads_file, 'w', gzip_level) as reads_stream:
        for _input_line_count, line in enumerate(input_stream):
            seq = line.strip()
            print >> reads_stream, '\t'.join([seq, seq, 'I' * len(seq)])
    input_command = 'gzip -cd %s' % reads_file
    bowtie_command = ' '.join([
        bowtie2_exe, bowtie2_args if bowtie2_args is not None else '',
        ' --local -t --no-hd --mm -x', bowtie2_index_base, '--12 -',
        '--score-min L,%d,0' % score_min, '-D 24 -R 3 -N 1 -L 20 -i L,4,0'
    ])
    # The delegate script sits next to this file and is named
    # <this file minus ".py">_delegate.py
    delegate_command = ''.join([
        sys.executable, ' ',
        os.path.realpath(__file__)[:-3],
        '_delegate.py --report-multiplier %08f --fudge %d %s %s' %
        (report_multiplier, fudge, '--stranded' if stranded else '',
         '--verbose' if verbose else '')
    ])
    full_command = ' | '.join(
        [input_command, bowtie_command, delegate_command])
    print >> sys.stderr, 'Starting Bowtie2 with command: ' + full_command
    # NOTE(review): the command is assembled by string interpolation and run
    # through a shell; paths and bowtie2_args are not quoted, so they must
    # come from a trusted source.
    # pipefail makes the pipeline's exit code nonzero if any stage fails.
    bowtie_process = subprocess.Popen(' '.join(
        ['set -exo pipefail;', full_command]),
                                      bufsize=-1,
                                      stdout=sys.stdout,
                                      stderr=sys.stderr,
                                      shell=True,
                                      executable='/bin/bash')
    return_code = bowtie_process.wait()
    if return_code:
        raise RuntimeError('Error occurred while reading Bowtie 2 output; '
                           'exitlevel was %d.' % return_code)
示例#4
0
for input_line_count, line in enumerate(sys.stdin):
    # Kill offset from start of manifest file
    tokens = line.strip().split('\t')[1:]
    try:
        stripped = tokens[0].strip()
        if stripped[0] == '#' or not line.strip():
            continue
    except IndexError:
        continue
    token_count = len(tokens)
    assert token_count in [
        3, 5
    ], ('Line {} of input has {} fields, but 3 or 5 are expected.').format(
        input_line_count + 1, token_count)
    file_to_count = tokens[0]
    if (not ((token_count == 3 and Url(tokens[0]).is_local) or
             (token_count == 5 and Url(tokens[0]).is_local
              and Url(tokens[2]).is_local))):
        sys.stdout.write(line)
        output_line_count += 1
        continue
    with xopen(None, file_to_count) as input_stream:
        first_char = input_stream.readline()[0]
        if first_char in fastq_cues:
            # 4 lines per record
            line_divider = 4
        elif first_char in fasta_cues:
            line_divider = 2
        else:
            raise RuntimeError(
                'File "{}" is neither a FASTA nor a FASTQ file.'.format(