Example #1
def go(nucleotides_per_input=8000000, gzip_output=True, gzip_level=3,
        to_stdout=False, push='.', mover=filemover.FileMover(),
        verbose=False, scratch=None, bin_qualities=True, short_qnames=False,
        skip_bad_records=False, workspace_dir=None,
        fastq_dump_exe='fastq-dump', ignore_missing_sra_samples=False):
    """ Runs Rail-RNA-preprocess

        Input (read from stdin)
        ----------------------------
        Tab-separated fields:
        ---If URL is local:
        1. #!splitload
        2. \x1d-separated list of 0-based indexes of reads at which to start
            each new file
        3. \x1d-separated list of numbers of reads to include in gzipped files
        4. \x1d-separated list of manifest lines whose tabs are replaced by
            \x1es

        ---Otherwise:
        manifest line

        A manifest line has the following format

        (for single-end reads)
        <URL>(tab)<Optional MD5>(tab)<Sample label>

        (for paired-end reads)
        <URL 1>(tab)<Optional MD5 1>(tab)<URL 2>(tab)<Optional MD5 2>(tab)
        <Sample label>

        Hadoop output (written to stdout)
        ----------------------------
        None.

        Other output (written to directory specified by command-line parameter
            --push)
        ----------------------------
        Files containing input data in one of the following formats:

        Format 1 (single-end, 4 fields):
          1. Nucleotide sequence or its reversed complement, whichever is first
            in alphabetical order
          2. 1 if sequence was reverse-complemented else 0
          3. Name
          4. Quality sequence or its reverse, whichever corresponds to field 1

        Format 2 (paired-end, 2 lines, 4 fields each; each line has the
        same layout as Format 1)
          1. Nucleotide sequence for mate 1 or its reversed complement,
            whichever is first in alphabetical order
          2. 1 if sequence was reverse-complemented else 0
          3. Name for mate 1
          4. Quality sequence for mate 1 or its reverse, whichever corresponds
            to field 1
            
            (new line)

          1. Nucleotide sequence for mate 2 or its reversed complement,
            whichever is first in alphabetical order
          2. 1 if sequence was reverse complemented else 0
          3. Name for mate 2
          4. Quality sequence for mate 2 or its reverse, whichever corresponds
            to field 1

        Quality sequences are strings of Is for FASTA input.

        nucleotides_per_input: maximum number of nucleotides to put in a given
            input file
        gzip_output: True iff preprocessed input should be gzipped
        gzip_level: level of gzip compression to use
        to_stdout: True iff preprocessed records should be written to stdout
            rather than to output files
        push: where to send output
        mover: FileMover object for retrieving remote inputs and pushing
            output files to the --push destination
        verbose: True iff extra debugging statements should be printed to
            stderr
        scratch: scratch directory for storing temporary files, or None if a
            securely created temporary directory should be used
        bin_qualities: True iff quality string should be binned according to
            rules in _mismatch_penalties_to_quality_scores
            and round_quality_string() defined in go()
        short_qnames: True iff original qname should be killed and a new qname
            should be written in a short base64-encoded format
        skip_bad_records: True iff bad records should be skipped; otherwise,
            raises exception if bad record is encountered
        workspace_dir: where to use fastq-dump -- needed for working with
            dbGaP data. None if temporary dir should be used.
        fastq_dump_exe: path to fastq-dump executable
        ignore_missing_sra_samples: True iff no error should be raised when
            fastq-dump cannot find a sample

        No return value
    """
    if bin_qualities:
        import math
        def round_quality_string(qual):
            """ Bins phred+33 quality string to improve compression.

                Uses 5-bin scheme that does not affect Bowtie 2 alignments

                qual: quality string

                Return value: "binned" quality string.
            """
            return ''.join(
                [str(int(
                    _MN + math.floor((_MX - _MN) * min(
                                                    ord(qual_char) - 33.0, 40.0
                                                ) / 40.0)
                        )) for qual_char in qual]).translate(
                                _mismatch_penalties_to_quality_scores
                            )
    else:
        def round_quality_string(qual):
            """ Leaves quality string unbinned and untouched.

                qual: quality string

                Return value: qual
            """
            return qual
    global _input_line_count, _output_line_count
    skip_stubs = False
    temp_dir = make_temp_dir(scratch)
    print >>sys.stderr, 'Created local destination directory "%s".' % temp_dir
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir])
    input_line_count, output_line_count = 0, 0
    if not to_stdout:
        push_url = Url(push)
        if push_url.is_local:
            destination = push
        elif push_url.is_s3 or push_url.is_hdfs or push_url.is_nfs:
            destination = temp_dir
        else:
            raise RuntimeError('Push destination must be '
                               'on S3, HDFS, NFS, or local.')
    fastq_cues = set(['@'])
    fasta_cues = set(['>', ';'])
    source_dict = {}
    onward = False
    for line in sys.stdin:
        _input_line_count += 1
        if not line.strip(): continue
        # Kill offset from start of manifest file
        try:
            tokens = line.strip().split('\t')[1:]
            if tokens[0][0] == '#' and tokens[0] != '#!splitload':
                # Comment line
                continue
        except IndexError:
            # Be robust to bad lines
            continue
        token_count = len(tokens)
        qual_getter = None
        if tokens[0] == '#!splitload':
            '''Line specifies precisely how records from files should be
            placed.'''
            assert not to_stdout, ('Split manifest line inconsistent with '
                                   'writing to stdout.')
            qual_getter = phred_converter(phred_format=tokens[-1])
            indexes = tokens[1].split('\x1d')
            read_counts = tokens[2].split('\x1d')
            manifest_lines = [token.split('\x1e')
                                for token in tokens[3].split('\x1d')]
            assert len(indexes) == len(read_counts) == len(manifest_lines)
            for i, manifest_line in enumerate(manifest_lines):
                manifest_line_field_count = len(manifest_line)
                if manifest_line_field_count == 3:
                    source_dict[(Url(manifest_line[0]),)] = (
                            manifest_line[-1],
                            int(indexes[i]),
                            int(read_counts[i])
                        )
                else:
                    assert manifest_line_field_count == 5
                    source_dict[(Url(manifest_line[0]),
                                 Url(manifest_line[2]))] = (
                                                        manifest_line[-1],
                                                        int(indexes[i]),
                                                        int(read_counts[i])
                                                    )
        elif token_count == 3:
            # SRA or single-end reads
            source_dict[(Url(tokens[0]),)] = (tokens[-1],)
        elif token_count == 5:
            # Paired-end reads
            source_dict[(Url(tokens[0]), Url(tokens[2]))] = (tokens[-1],)
        else:
            # Not a valid line, but continue for robustness
            continue
    file_number = 0
    for source_urls in source_dict:
        sample_label = source_dict[source_urls][0]
        onward = False # Reset so a missing SRA sample skips only itself
        downloaded = set()
        sources = []
        records_printed = 0
        if len(source_dict[source_urls]) == 3:
            skip_count = source_dict[source_urls][1]
            if len(source_urls) == 2:
                records_to_consume = source_dict[source_urls][2]
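                '''Mates must stay paired, so round the skip down to an even
                boundary (folding the odd record into the consume count) and
                round the consume count down to an even number.'''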
                if skip_count % 2:
                    skip_count -= 1
                    records_to_consume += 1
                if records_to_consume % 2:
                    records_to_consume -= 1
                # Index reads according to order in input to shorten read names
                read_index = skip_count / 2 # Index reads in pairs
            else:
                records_to_consume = source_dict[source_urls][2]
                read_index = skip_count
        else:
            skip_count = 0
            records_to_consume = None # Consume all records
            read_index = 0
        assert records_to_consume is None or records_to_consume >= 0, (
                'Negative value %d of records to consume encountered.'
            ) % records_to_consume
        if records_to_consume == 0: continue
        skipped = False
        for source_url in source_urls:
            if not source_url.is_local:
                # Download
                print >>sys.stderr, 'Retrieving URL "%s"...' \
                    % source_url.to_url()
                if source_url.is_dbgap:
                    download_dir = workspace_dir
                elif source_url.is_sra:
                    download_dir = temp_dir
                if source_url.is_sra:
                    sra_accession = source_url.to_url()
                    fastq_dump_command = (
                            'set -exo pipefail; cd {download_dir}; '
                            '{fastq_dump_exe} -I -X 10000 --split-files '
                            '{sra_accession}'
                        ).format(download_dir=download_dir,
                                    fastq_dump_exe=fastq_dump_exe,
                                    sra_accession=sra_accession)
                    try:
                        subprocess.check_call(
                            fastq_dump_command, shell=True, 
                            executable='/bin/bash',
                            stdout=sys.stderr
                        )
                    except subprocess.CalledProcessError as e:
                        if e.returncode == 3 and ignore_missing_sra_samples:
                            onward = True
                            break
                        else:
                            raise RuntimeError(
                                ('Error "%s" encountered executing '
                                 'command "%s".') % (e.output,
                                                        fastq_dump_command))
                    import glob
                    sra_fastq_files = sorted(
                                        glob.glob(os.path.join(download_dir,
                                            '%s[_.]*' % sra_accession))
                                        ) # ensure 1 before 2 if paired-end
                    # Schedule for deletion
                    def silent_remove(filename):
                        try:
                            os.remove(filename)
                        except OSError as e:
                            pass
                    for sra_fastq_file in sra_fastq_files:
                        register_cleanup(silent_remove, sra_fastq_file)
                    sra_file_count = len(sra_fastq_files)
                    check_for_paired = False
                    if sra_file_count == 1:
                        sra_paired_end = False
                        print >>sys.stderr, 'Detected single-end SRA sample.'
                    elif sra_file_count in [2, 3]:
                        print >>sys.stderr, ('2 or 3 FASTQ files detected. '
                                             'Checking for barcodes...')
                        check_for_paired = True
                    else:
                        raise RuntimeError(
                                ('Unexpected number of files "%d" output '
                                 'by fastq-dump command "%s".')
                                    % (sra_file_count, fastq_dump_command)
                            )
                    if check_for_paired:
                        # Get max/min read lengths from FASTQ
                        with open(
                                    sra_fastq_files[sra_file_count - 2]
                                ) as fastq_stream:
                            max_len, min_len = (
                                    max_min_read_lengths_from_fastq_stream(
                                        fastq_stream
                                    )
                                )
                            print >>sys.stderr, (
                                    'Max/min read length found in candidate '
                                    'barcode FASTQ was {}/{}.'
                                ).format(max_len, min_len)
                            if max_len <= _max_stubby_read_length:
                                print >>sys.stderr, (
                                        'Assumed barcode FASTQ.'
                                    )
                                skip_stubs = True
                                if sra_file_count == 2:
                                    sra_paired_end = False
                                else:
                                    sra_paired_end = True
                            else:
                                if sra_file_count == 2:
                                    sra_paired_end = True
                                else:
                                    raise RuntimeError(
                                        '3 FASTQs detected, but one of them '
                                        'was not recognized as containing '
                                        'barcodes.'
                                    )
                    # Guess quality from first 10k lines
                    with xopen(None, sra_fastq_files[0]) as source_stream:
                        qual_getter = phred_converter(
                                            fastq_stream=source_stream
                                        )
                    for sra_fastq_file in sra_fastq_files:
                        os.remove(sra_fastq_file)
                    sources.append(os.devnull)
                    fastq_dump_command = (
                            'set -exo pipefail; cd {download_dir}; '
                            '{fastq_dump_exe} --split-spot -I --stdout '
                            '{sra_accession}'
                        ).format(download_dir=download_dir,
                                    fastq_dump_exe=fastq_dump_exe,
                                    sra_accession=sra_accession)
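                    '''When skip_stubs is set, the awk filter appended below
                    keeps only reads longer than _max_stubby_read_length,
                    screening out the barcode reads detected above.'''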
                    if skip_stubs:
                        fastq_dump_command += (
                                ' | awk \'BEGIN {{OFS = "\\n"}} '
                                '{{header = $0; '
                                'getline seq; getline qheader; getline qseq; '
                                'if (length(seq) > {min_len}) {{print header, '
                                'seq, qheader, qseq}}}}\''
                            ).format(min_len=_max_stubby_read_length)
                    print >>sys.stderr, fastq_dump_command
                    sra_process = subprocess.Popen(fastq_dump_command,
                                                    shell=True,
                                                    executable='/bin/bash',
                                                    stdout=subprocess.PIPE,
                                                    bufsize=-1)
                else:
                    mover.get(source_url, temp_dir)
                    downloaded = list(
                            set(os.listdir(temp_dir)).difference(downloaded)
                        )
                    sources.append(os.path.join(temp_dir, downloaded[0]))
            else:
                sources.append(source_url.to_url())
        if onward: continue
        '''Use os.devnull so single- and paired-end data can be handled in one
        loop.'''
        if len(sources) == 1:
            sources.append(os.devnull)
        if qual_getter is None:
            # Figure out Phred format
            with xopen(None, sources[0]) as source_stream:
                qual_getter = phred_converter(fastq_stream=source_stream)
        with xopen(None, sources[0]) as source_stream_1, xopen(
                None, sources[1]
            ) as source_stream_2:
            source_streams = [source_stream_1, source_stream_2]
            reorganize = all([source == os.devnull for source in sources])
            if reorganize:
                # SRA data is live
                if sra_paired_end:
                    source_streams = [sra_process.stdout, sra_process.stdout]
                else:
                    source_streams = [sra_process.stdout, open(os.devnull)]
            break_outer_loop = False
            while True:
                if not to_stdout:
                    '''Name files using Hadoop task environment property
                    mapred.task.partition.'''
                    if gzip_output:
                        try:
                            output_file = os.path.join(
                                    destination, 
                                    '.'.join([
                                        os.environ['mapred_task_partition'],
                                        str(file_number), 'gz'
                                    ])
                                )
                        except KeyError:
                            '''Hadoop 2.x: mapreduce.task.partition; see 
                            http://hadoop.apache.org/docs/r2.0.3-alpha/
                            hadoop-project-dist/hadoop-common/
                            DeprecatedProperties.html.'''
                            output_file = os.path.join(
                                    destination, 
                                    '.'.join([
                                        os.environ['mapreduce_task_partition'],
                                        str(file_number), 'gz'
                                    ])
                                )
                        open_args = [output_file, 'a', gzip_level]
                    else:
                        try:
                            output_file = os.path.join(
                                    destination, 
                                    '.'.join([
                                        os.environ['mapred_task_partition'],
                                        str(file_number)
                                    ])
                                )
                        except KeyError:
                            output_file = os.path.join(
                                    destination, 
                                    '.'.join([
                                        os.environ['mapreduce_task_partition'],
                                        str(file_number)
                                    ])
                                )
                        open_args = [output_file, 'a']
                    try:
                        os.makedirs(os.path.dirname(output_file))
                    except OSError:
                        pass
                else:
                    open_args = []
                '''Use xopen to handle compressed streams and normal streams
                generally.'''
                with xopen(gzip_output if not to_stdout else '-', *open_args) \
                    as output_stream:
                    perform_push = False
                    line_numbers = [0, 0]
                    read_next_line = True
                    nucs_read = 0
                    pairs_read = 0
                    while True:
                        if read_next_line:
                            # Read next line only if FASTA mode didn't already
                            lines = []
                            for source_stream in source_streams:
                                lines.append(source_stream.readline())
                        read_next_line = True
                        if not lines[0]:
                            break_outer_loop = True
                            break
                        line_numbers = [i + 1 for i in line_numbers]
                        lines = [line.strip() for line in lines]
                        bad_record_skip = False
                        if lines[0][0] in fastq_cues:
                            if records_to_consume and not skipped:
                                '''Skip lines as necessary; for paired-end
                                reads skip the largest even number of records 
                                less than records_to_consume.'''
                                if len(source_urls) == 1:
                                    # single-end
                                    line_skip_count = max(
                                            skip_count * 4 - 1, 0
                                        )
                                else:
                                    # paired-end
                                    line_skip_count = max(
                                            ((skip_count / 2) * 4 - 1), 0
                                        )
                                    for _ in xrange(line_skip_count):
                                        next(source_stream_2)
                                for _ in xrange(line_skip_count):
                                    next(source_stream_1)
                                if skip_count:
                                    lines = []
                                    for source_stream in source_streams:
                                        lines.append(source_stream.readline())
                                    if not lines[0]:
                                        break_outer_loop = True
                                        break
                                    lines = [line.strip() for line in lines]
                                skipped = True
                            seqs = [source_stream.readline().strip()
                                        for source_stream in source_streams]
                            line_numbers = [i + 1 for i in line_numbers]
                            plus_lines = [source_stream.readline().strip()
                                            for source_stream
                                            in source_streams]
                            line_numbers = [i + 1 for i in line_numbers]
                            quals = [source_stream.readline().strip()
                                        for source_stream in source_streams]
                            if reorganize and sra_paired_end:
                                # Fix order!
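                                '''fastq-dump --split-spot interleaves mate 1
                                and mate 2 in one stream, and both source
                                streams read from that same pipe; the eight
                                lines just consumed are mate 1's full record
                                followed by mate 2's, so redistribute them
                                into per-mate fields.'''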
                                lines, seqs, plus_lines, quals = (
                                        [lines[0], plus_lines[0]],
                                        [lines[1], plus_lines[1]],
                                        [seqs[0], quals[0]],
                                        [seqs[1], quals[1]]
                                    )
                            try:
                                assert plus_lines[0][0] == '+', (
                                        'Malformed read "%s" at line %d of '
                                        'file "%s".'
                                    ) % (lines[0], line_numbers[0], sources[0])
                                if plus_lines[1]:
                                    assert plus_lines[1][0] == '+', (
                                            'Malformed read "%s" at line %d '
                                            'of file "%s".'
                                        ) % (
                                        lines[1], line_numbers[1], sources[1]
                                    )
                                try:
                                    # Kill spaces in name
                                    original_qnames = \
                                        [line[1:].replace(' ', '_')
                                            for line in lines]
                                except IndexError:
                                    raise RuntimeError(
                                            'Error finding QNAME at '
                                            'line %d of either %s or %s' % (
                                                        line_numbers[0],
                                                        sources[0],
                                                        sources[1]
                                                    )
                                        )
                            except (AssertionError,
                                    IndexError, RuntimeError) as e:
                                if skip_bad_records:
                                    print >>sys.stderr, ('Error "%s" '
                                            'encountered; skipping bad record.'
                                        ) % e.message
                                    for source_stream in source_streams:
                                        source_stream.readline()
                                    line_numbers = [
                                            i + 1 for i in line_numbers
                                        ]
                                    bad_record_skip = True
                                else:
                                    raise
                            else:
                                try:
                                    quals = [
                                            qual_getter(qual) for qual in quals
                                        ]
                                except Exception as e:
                                    if skip_bad_records:
                                        print >>sys.stderr, (
                                                'Error "%s" encountered '
                                                'trying to convert quality '
                                                'string to Sanger format; '
                                                'skipping bad record.'
                                            ) % e.message
                                        bad_record_skip = True
                                    else:
                                        raise
                                line_numbers = [i + 1 for i in line_numbers]
                                try: 
                                    for i in xrange(2):
                                        assert len(seqs[i]) == len(quals[i]), (
                                            'Length of read sequence does not '
                                            'match length of quality string '
                                            'at line %d of file "%s".'
                                        ) % (line_numbers[i], sources[i])
                                except (AssertionError, IndexError) as e:
                                    if skip_bad_records:
                                        print >>sys.stderr, (
                                                'Error "%s" encountered; '
                                                'skipping bad record.'
                                            ) % e.message
                                        bad_record_skip = True
                                    else:
                                        raise
                        elif lines[0][0] in fasta_cues:
                            seqs = [[], []]
                            next_lines = []
                            for p, source_stream in enumerate(source_streams):
                                while True:
                                    next_line \
                                        = source_stream.readline().strip()
                                    try:
                                        if next_line[0] in fasta_cues:
                                            break
                                        else:
                                            seqs[p].append(next_line)
                                    except IndexError:
                                        break
                                next_lines.append(next_line)
                            seqs = [''.join(seq) for seq in seqs]
                            line_numbers = [i + 1 for i in line_numbers]
                            try:
                                try:
                                    # Kill spaces in name
                                    original_qnames = \
                                        [line[1:].replace(' ', '_')
                                            for line in lines]
                                except IndexError:
                                    raise RuntimeError(
                                            'Error finding QNAME at '
                                            'line %d of either %s or %s' % (
                                                        line_numbers[0],
                                                        sources[0],
                                                        sources[1]
                                                    )
                                        )
                            except (AssertionError,
                                    IndexError, RuntimeError) as e:
                                if skip_bad_records:
                                    print >>sys.stderr, ('Error "%s" '
                                            'encountered; skipping bad record.'
                                        ) % e.message
                                    for source_stream in source_streams:
                                        source_stream.readline()
                                    line_numbers = [
                                            i + 1 for i in line_numbers
                                        ]
                                    bad_record_skip = True
                                else:
                                    raise
                            else:
                                try:
                                    quals = [
                                        'h'*len(seq) for seq in seqs
                                        ]
                                except Exception as e:
                                    if skip_bad_records:
                                        print >>sys.stderr, (
                                                'Error "%s" encountered '
                                                'trying to convert quality '
                                                'string to Sanger format; '
                                                'skipping bad record.'
                                            ) % e.message
                                        bad_record_skip = True
                                    else:
                                        raise
                                line_numbers = [i + 1 for i in line_numbers]
                            lines = next_lines
                            read_next_line = False
                        if bad_record_skip:
                            seqs = []
                            # Fake record-printing to get to records_to_consume
                            if source_streams[-1].name == os.devnull:
                                records_printed += 1
                            else:
                                records_printed += 2
                        elif len(original_qnames) == 2 and original_qnames[1]:
                            # Paired-end write
                            if original_qnames[0] == original_qnames[1]:
                                # Add paired-end identifiers
                                original_qnames[0] += '/1'
                                original_qnames[1] += '/2'
                            assert seqs[1]
                            assert quals[1]
                            seqs = [seq.upper() for seq in seqs]
                            reversed_complement_seqs = [
                                    seqs[0][::-1].translate(
                                        _reversed_complement_translation_table
                                    ),
                                    seqs[1][::-1].translate(
                                        _reversed_complement_translation_table
                                    )
                                ]
                            if seqs[0] < reversed_complement_seqs[0]:
                                left_seq = seqs[0]
                                left_qual = quals[0]
                                left_reversed = '0'
                            else:
                                left_seq = reversed_complement_seqs[0]
                                left_qual = quals[0][::-1]
                                left_reversed = '1'
                            if seqs[1] < reversed_complement_seqs[1]:
                                right_seq = seqs[1]
                                right_qual = quals[1]
                                right_reversed = '0'
                            else:
                                right_seq = reversed_complement_seqs[1]
                                right_qual = quals[1][::-1]
                                right_reversed = '1'
                            if short_qnames:
                                left_qname_to_write = encode(read_index) + '/1'
                                right_qname_to_write = encode(
                                                            read_index
                                                        ) + '/2'
                            else:
                                left_qname_to_write = original_qnames[0]
                                right_qname_to_write = original_qnames[1]
                            print >>output_stream, '\t'.join(
                                        [
                                            left_seq,
                                            left_reversed,
                                            qname_from_read(
                                                    left_qname_to_write,
                                                    seqs[0] + quals[0], 
                                                    sample_label,
                                                    mate=seqs[1]
                                                ),
                                            '\n'.join([
                                                round_quality_string(
                                                    left_qual
                                                ), right_seq
                                            ]),
                                            right_reversed,
                                            qname_from_read(
                                                    right_qname_to_write,
                                                    seqs[1] + quals[1], 
                                                    sample_label,
                                                    mate=seqs[0]
                                                ),
                                            round_quality_string(right_qual)
                                        ]
                                    )
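                            # The embedded '\n' in the fourth element makes
                            # this single print emit both mate lines of
                            # Format 2.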
                            records_printed += 2
                            _output_line_count += 1
                        else:
                            seqs[0] = seqs[0].upper()
                            reversed_complement_seqs = [
                                    seqs[0][::-1].translate(
                                        _reversed_complement_translation_table
                                    )
                                ]
                            # Single-end write
                            if seqs[0] < reversed_complement_seqs[0]:
                                seq = seqs[0]
                                qual = quals[0]
                                is_reversed = '0'
                            else:
                                seq = reversed_complement_seqs[0]
                                qual = quals[0][::-1]
                                is_reversed = '1'
                            if short_qnames:
                                qname_to_write = encode(read_index)
                            else:
                                qname_to_write = original_qnames[0]
                            print >>output_stream, '\t'.join(
                                        [
                                            seq,
                                            is_reversed,
                                            qname_from_read(
                                                qname_to_write,
                                                seqs[0] + quals[0], 
                                                sample_label
                                            ),
                                            round_quality_string(qual)
                                        ]
                                    )
                            records_printed += 1
                            _output_line_count += 1
                        read_index += 1
                        for seq in seqs:
                            nucs_read += len(seq)
                        if records_printed == records_to_consume:
                            break_outer_loop = True
                            perform_push = True
                            break
                        if not to_stdout and not records_to_consume and \
                            nucs_read > nucleotides_per_input:
                            file_number += 1
                            break
                if verbose:
                    print >>sys.stderr, (
                            'Exited with statement; line numbers are %s' 
                            % line_numbers
                        )
                if (not to_stdout) and (push_url.is_nfs or
                    push_url.is_s3 or push_url.is_hdfs) \
                    and ((not records_to_consume) or
                         (records_to_consume and perform_push)):
                    print >>sys.stderr, 'Pushing "%s" to "%s" ...' % (
                                                            output_file,
                                                            push_url.to_url()
                                                        )
                    print >>sys.stderr, 'reporter:status:alive'
                    mover.put(output_file, push_url.plus(os.path.basename(
                                                                output_file
                                                            )))
                    try:
                        os.remove(output_file)
                    except OSError:
                        pass
                if break_outer_loop: break
            if verbose:
                print >>sys.stderr, 'Exiting source streams...'
        if verbose:
            print >>sys.stderr, 'Exited source streams.'
        # Clear temporary directory
        for input_file in os.listdir(temp_dir):
            try:
                os.remove(os.path.join(temp_dir, input_file))
            except OSError:
                pass
        if 'sra_process' in locals():
            sra_process.stdout.close()
            sra_return_code = sra_process.wait()
            if sra_return_code > 0:
                raise RuntimeError(('fastq-dump terminated with exit '
                                    'code %d. Command run was "%s".')
                                        % (sra_return_code,
                                            fastq_dump_command))
            del sra_process
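A minimal sketch (not Rail-RNA's own code) of the canonicalization rule used by both write branches above: a read is stored as whichever of its sequence and reverse complement sorts first alphabetically, with a flag recording whether the stored copy was reverse-complemented; the quality string is reversed to match. The translation table here is a stand-in for Rail-RNA's _reversed_complement_translation_table.

import string

# Stand-in reverse-complement table (Rail-RNA builds its own)
_rc_table = string.maketrans('ACGTN', 'TGCAN')

def canonicalize(seq, qual):
    """ Returns (stored_seq, reversed_flag, stored_qual). """
    seq = seq.upper()
    rc = seq[::-1].translate(_rc_table)
    if seq < rc:
        return seq, '0', qual
    return rc, '1', qual[::-1]

print canonicalize('TTGCA', 'IIIJJ')  # ('TGCAA', '1', 'JJIII')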
Example #2
    help='Print out extra debugging statements')

filemover.add_args(parser)
tempdel.add_args(parser)
args = parser.parse_args()

import time
start_time = time.time()

input_line_count = 0

if args.out is not None:
    '''If --out is a local file, just write directly to that file. Otherwise,
    write to a temporary file that will later be uploaded to the
    destination.'''
    output_url = Url(args.out)
    if output_url.is_local:
        try: os.makedirs(output_url.to_url())
        except: pass
        output_filename = os.path.join(args.out, args.junction_filename)
    else:
        temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
        register_cleanup(tempdel.remove_temporary_directories,
                            [temp_dir_path])
        output_filename = args.junction_filename + '.temp'
        output_filename = os.path.join(temp_dir_path, output_filename)
    with xopen(True, output_filename, 'w', args.gzip_level) as output_stream:
        for line in sys.stdin:
            tokens = line.strip().split('\t')
            # Remove leading zeros from ints
            print >>output_stream, '\t'.join(
Example #3
    help='Basename for index to be written')
parser.add_argument(\
    '--keep-alive', action='store_const', const=True, default=False,
    help='Prints reporter:status:alive messages to stderr to keep EMR '
         'task alive')

filemover.add_args(parser)
bowtie.add_args(parser)
tempdel.add_args(parser)
args = parser.parse_args()

import time
start_time = time.time()

output_filename, output_stream, output_url = [None] * 3
output_url = Url(args.out) if args.out is not None \
    else Url(os.getcwd())
# Set up temporary destination
import tempfile
temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
# For deleting temporary directory, even on unexpected exit
register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])
# Set up temporary destination
try:
    os.makedirs(os.path.join(temp_dir_path, 'index'))
except:
    pass
# Write to temporary directory, and later upload to URL
index_basename = os.path.join(temp_dir_path, 'index/' + args.basename)
fasta_file = os.path.join(temp_dir_path, 'temp.fa')
print >> sys.stderr, 'Opened %s for writing....' % fasta_file
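The make_temp_dir/register_cleanup pairing that recurs throughout these examples comes from Rail-RNA's dooplicity and tempdel helpers. As rough orientation only, the idiom behaves like this standard-library equivalent:

import atexit
import shutil
import tempfile

temp_dir_path = tempfile.mkdtemp()  # securely created scratch directory
# Remove the directory at interpreter exit, ignoring errors
atexit.register(shutil.rmtree, temp_dir_path, True)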
Example #4
File: bed.py Project: BenLangmead/rail
if args.keep_alive:
    from dooplicity.tools import KeepAlive
    keep_alive_thread = KeepAlive(sys.stderr)
    keep_alive_thread.start()

import time
start_time = time.time()

reference_index = bowtie_index.BowtieIndexReference(
                            os.path.expandvars(args.bowtie_idx)
                        )
# For mapping sample indices back to original sample labels
manifest_object = manifest.LabelsAndIndices(
                            os.path.expandvars(args.manifest)
                        )
output_url = Url(args.out) if args.out is not None \
    else Url(os.getcwd())
input_line_count = 0
counter = Counter('bed')
register_cleanup(counter.flush)

if output_url.is_local:
    # Set up destination directory
    try: os.makedirs(output_url.to_url())
    except: pass
else:
    mover = filemover.FileMover(args=args)
    # Set up temporary destination
    import tempfile
    temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])
for (line_type, sample_label), xpartition in xstream(sys.stdin, 2):
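Example #4 is cut off at an xstream loop header. xstream, from dooplicity.tools, groups sorted tab-delimited Hadoop-streaming input by its first k key fields, yielding a key tuple and an iterator over the remaining fields of each record in the group. A hedged standard-library approximation:

import itertools

def xstream_like(input_stream, k):
    """ Yields (key tuple, iterator over value tuples) per key group. """
    tokenized = (line.rstrip('\n').split('\t') for line in input_stream)
    for key, group in itertools.groupby(
                tokenized, key=lambda tokens: tuple(tokens[:k])
            ):
        yield key, (tuple(tokens[k:]) for tokens in group)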
Example #5
    help='Basename for index to be written')
parser.add_argument(\
    '--keep-alive', action='store_const', const=True, default=False,
    help='Prints reporter:status:alive messages to stderr to keep EMR '
         'task alive')

filemover.add_args(parser)
bowtie.add_args(parser)
tempdel.add_args(parser)
args = parser.parse_args()

import time
start_time = time.time()

output_filename, output_stream, output_url = [None]*3
output_url = Url(args.out) if args.out is not None \
    else Url(os.getcwd())
# Set up temporary destination
import tempfile
temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
# For deleting temporary directory, even on unexpected exit
register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])
# Set up temporary destination
try: os.makedirs(os.path.join(temp_dir_path, 'index'))
except: pass
# Write to temporary directory, and later upload to URL
index_basename = os.path.join(temp_dir_path, 'index/' + args.basename)
fasta_file = os.path.join(temp_dir_path, 'temp.fa')
print >>sys.stderr, 'Opened %s for writing....' % fasta_file
with open(fasta_file, 'w') as fasta_stream:
    input_line_count = 0
    for line in sys.stdin:
Example #6
            print 'counts\t-\t%s\t%s\t%d\t%d' % (sample_index, rname_index,
                                                 total_count, unique_count)
else:
    # Grab stats _and_ output SAM/BAMs
    if not args.output_sam:
        # Only need subprocess to start samtools if outputting bam
        import subprocess

    # Get RNAMEs in order of descending length
    sorted_rnames = [reference_index.string_to_rname['%012d' % i]
                        for i in xrange(
                                    len(reference_index.string_to_rname) - 1
                                )]
    total_count, unique_count = 0, 0
    if args.out is not None:
        output_url = Url(args.out)
        if output_url.is_local:
            # Set up destination directory
            try: os.makedirs(output_url.to_url())
            except: pass
            output_dir = args.out
        else:
            mover = filemover.FileMover(args=args)
            # Set up temporary destination
            import tempfile
            temp_dir_path = make_temp_dir(
                                tempdel.silentexpandvars(args.scratch)
                            )
            register_cleanup(tempdel.remove_temporary_directories,
                                [temp_dir_path])
            output_dir = temp_dir_path
Example #7
def go(input_stream=sys.stdin,
       output_stream=sys.stdout,
       bowtie2_exe='bowtie2',
       bowtie2_index_base='genome',
       bowtie2_args='',
       verbose=False,
       report_multiplier=1.2,
       stranded=False,
       fudge=5,
       score_min=60,
       gzip_level=3,
       mover=filemover.FileMover(),
       intermediate_dir='.',
       scratch=None):
    """ Runs Rail-RNA-cointron_enum 

        Alignment script for MapReduce pipelines that wraps Bowtie 2. Finds
        introns that cooccur on reads by local alignments to transcriptome
        elements from Bowtie 2.

        Input (read from stdin)
        ----------------------------
        Tab-delimited output tuple columns (readletize)
        1. SEQ or its reversed complement, whichever is first in alphabetical
            order
        2. Comma-separated list of sample labels if field 1 is the read
            sequence; '\x1c' if empty
        3. Comma-separated list of sample labels if field 1 is the reversed
            complement of the read sequence; '\x1c' if empty

        Hadoop output (written to stdout)
        ----------------------------
        Tab-delimited tuple columns:
        1. Reference name (RNAME in SAM format) + 
            '+' or '-' indicating which strand is the sense strand
        2. Comma-separated list of intron start positions in configuration
        3. Comma-separated list of intron end positions in configuration
        4. left_extend_size: by how many bases on the left side of an intron
            the reference should extend
        5. right_extend_size: by how many bases on the right side of an intron
            the reference should extend
        6. Read sequence

        input_stream: where to find input reads.
        output_stream: where to emit exonic chunks and introns.
        bowtie2_exe: filename of Bowtie 2 executable; include path if not in
            $PATH.
        bowtie2_index_base: the basename of the Bowtie index files associated
            with the reference.
        bowtie2_args: string containing precisely extra command-line arguments
            to pass to Bowtie 2, e.g., "--tryhard --best"; or None.
        verbose: True iff more informative messages should be written to
            stderr.
        report_multiplier: if verbose is True, the line number of an alignment
            written to stderr increases exponentially with base
            report_multiplier.
        stranded: True iff input reads are strand-specific; this affects
            whether an output partition has a terminal '+' or '-' indicating
            the sense strand. Further, if stranded is True, an alignment is
            returned only if its strand agrees with the intron's strand.
        fudge: by how many bases to extend left and right extend sizes
                to accommodate potential indels
        score_min: constant term of the Bowtie 2 minimum alignment score
            function (passed as --score-min L,<score_min>,0)
        gzip_level: compression level to use for temporary files
        mover: FileMover object, for use in case Bowtie2 idx needs to be
            pulled from S3
        intermediate_dir: where intermediates are stored; for temporarily
            storing transcript index if it needs to be pulled from S3
        scratch: scratch directory for storing temporary files or None if 
            securely created temporary directory

        No return value.
    """
    bowtie2_index_base_url = Url(bowtie2_index_base)
    if bowtie2_index_base_url.is_s3:
        index_basename = os.path.basename(bowtie2_index_base)
        index_directory = os.path.join(intermediate_dir, 'transcript_index')
        try:
            os.makedirs(index_directory)
        except OSError:
            # Directory may already exist
            pass
        if not os.path.exists(os.path.join(index_directory, '_STARTED')):
            # Download index
            with open(os.path.join(index_directory, '_STARTED'), 'w') \
                as started_stream:
                print >> started_stream, 'STARTED'
            for extension in [
                    '.1.bt2', '.2.bt2', '.3.bt2', '.4.bt2', '.rev.1.bt2',
                    '.rev.2.bt2'
            ]:
                # Download each index component
                mover.get(Url(bowtie2_index_base + extension),
                            index_directory)
            with open(os.path.join(index_directory, '_SUCCESS'), 'w') \
                as success_stream:
                print >> success_stream, 'SUCCESS'
        while not os.path.exists(os.path.join(index_directory, '_SUCCESS')):
            time.sleep(0.5)
        bowtie2_index_base = os.path.join(index_directory, index_basename)
    global _input_line_count
    temp_dir_path = make_temp_dir(scratch)
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])
    reads_file = os.path.join(temp_dir_path, 'reads.temp.gz')
    with xopen(True, reads_file, 'w', gzip_level) as reads_stream:
        for _input_line_count, line in enumerate(input_stream):
            seq = line.strip()
            print >> reads_stream, '\t'.join([seq, seq, 'I' * len(seq)])
    input_command = 'gzip -cd %s' % reads_file
    bowtie_command = ' '.join([
        bowtie2_exe, bowtie2_args if bowtie2_args is not None else '',
        ' --local -t --no-hd --mm -x', bowtie2_index_base, '--12 -',
        '--score-min L,%d,0' % score_min, '-D 24 -R 3 -N 1 -L 20 -i L,4,0'
    ])
    delegate_command = ''.join([
        sys.executable, ' ',
        os.path.realpath(__file__)[:-3],
        '_delegate.py --report-multiplier %08f --fudge %d %s %s' %
        (report_multiplier, fudge, '--stranded' if stranded else '',
         '--verbose' if verbose else '')
    ])
    full_command = ' | '.join(
        [input_command, bowtie_command, delegate_command])
    print >> sys.stderr, 'Starting Bowtie2 with command: ' + full_command
    bowtie_process = subprocess.Popen(' '.join(
        ['set -exo pipefail;', full_command]),
                                      bufsize=-1,
                                      stdout=sys.stdout,
                                      stderr=sys.stderr,
                                      shell=True,
                                      executable='/bin/bash')
    return_code = bowtie_process.wait()
    if return_code:
        raise RuntimeError('Error occurred while reading Bowtie 2 output; '
                           'exitlevel was %d.' % return_code)
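The reads file written above uses Bowtie 2's tab-delimited --12 input format, where an unpaired record is <name>(tab)<sequence>(tab)<qualities>; the code reuses the sequence as the read name and supplies dummy 'I' qualities:

seq = 'ACGTACGT'
print '\t'.join([seq, seq, 'I' * len(seq)])
# Emits: ACGTACGT<tab>ACGTACGT<tab>IIIIIIII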
Example #8
                    )
# For mapping sample indices back to original sample labels
manifest_object = manifest.LabelsAndIndices(
                        os.path.expandvars(args.manifest)
                    )
# Create file with chromosome sizes for bedTobigwig
sizes_filename = os.path.join(temp_dir_path, 'chrom.sizes')
if args.verbose:
    print >>sys.stderr, 'Sizes file: %s .' % sizes_filename
with open(sizes_filename, 'w') as sizes_stream:
    for rname in reference_index.rname_lengths:
        print >>sizes_stream, '%s %d' % (rname, 
            reference_index.rname_lengths[rname])

input_line_count, output_line_count = 0, 0
output_url = Url(args.out)
if output_url.is_local:
    # Set up destination directory
    try: os.makedirs(output_url.to_url())
    except: pass
mover = filemover.FileMover(args=args)
track_line = ('track type=bedGraph name="{name}" '
         'description="{description}" visibility=full '
         'color=227,29,118 altColor=0,179,220 priority=400')
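# {name} and {description} in track_line are presumably filled in per track
# via str.format in code beyond this excerpt.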
for (sample_index,), xpartition in xstream(sys.stdin, 1):
    counter.add('partitions')
    real_sample = True
    try:
        sample_label = manifest_object.index_to_label[sample_index]
    except KeyError:
        # It's a nonref track, a mean, or a median
Example #9
                    type=str,
                    required=False,
                    default='split.manifest',
                    help='Output manifest filename')

# Add scratch command-line parameter
tempdel.add_args(parser)

args = parser.parse_args(sys.argv[1:])

start_time = time.time()
input_line_count, output_line_count = 0, 0
counter = Counter('assign_splits')
register_cleanup(counter.flush)

output_url = Url(args.out) if args.out is not None else Url(os.getcwd())
if output_url.is_local:
    # Set up destination directory
    try:
        os.makedirs(output_url.to_url())
    except:
        pass
    output_path = os.path.join(args.out, args.filename)
else:
    mover = filemover.FileMover(args=args)
    print >> sys.stderr, 'Instantiated FileMover.'
    # Set up temporary destination
    import tempfile
    from dooplicity.tools import make_temp_dir
    temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])
Example #10
formed; reference_index.rname_lengths[RNAME] is the length of RNAME.'''
reference_index = bowtie_index.BowtieIndexReference(
    os.path.expandvars(args.bowtie_idx))
# For mapping sample indices back to original sample labels
manifest_object = manifest.LabelsAndIndices(os.path.expandvars(args.manifest))
# Create file with chromosome sizes for bedTobigwig
sizes_filename = os.path.join(temp_dir_path, 'chrom.sizes')
if args.verbose:
    print >> sys.stderr, 'Sizes file: %s .' % sizes_filename
with open(sizes_filename, 'w') as sizes_stream:
    for rname in reference_index.rname_lengths:
        print >> sizes_stream, '%s %d' % (rname,
                                          reference_index.rname_lengths[rname])

input_line_count, output_line_count = 0, 0
output_url = Url(args.out)
if output_url.is_local:
    # Set up destination directory
    try:
        os.makedirs(output_url.to_url())
    except:
        pass
mover = filemover.FileMover(args=args)
track_line = ('track type=bedGraph name="{name}" '
              'description="{description}" visibility=full '
              'color=227,29,118 altColor=0,179,220 priority=400')
for (sample_index, ), xpartition in xstream(sys.stdin, 1):
    try:
        sample_label = manifest_object.index_to_label[sample_index]
    except KeyError:
        # It's a mean or median
Example #11
filemover.add_args(parser)
tempdel.add_args(parser)
args = parser.parse_args()

import time
start_time = time.time()

input_line_count = 0
counter = Counter('junction_collect')
register_cleanup(counter.flush)

if args.out is not None:
    '''If --out is a local file, just write directly to that file. Otherwise,
    write to a temporary file that will later be uploaded to the
    destination.'''
    output_url = Url(args.out)
    if output_url.is_local:
        try: os.makedirs(output_url.to_url())
        except: pass
        output_filename = os.path.join(args.out, args.junction_filename)
    else:
        temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
        register_cleanup(tempdel.remove_temporary_directories,
                            [temp_dir_path])
        output_filename = args.junction_filename + '.temp'
        output_filename = os.path.join(temp_dir_path, output_filename)
    with xopen(True, output_filename, 'w', args.gzip_level) as output_stream:
        for line in sys.stdin:
            counter.add('inputs')
            tokens = line.strip().split('\t')
            # Remove leading zeros from ints
Example #12
else:
    # Grab stats _and_ output SAM/BAMs
    if not args.output_sam:
        # Only need subprocess to start samtools if outputting bam
        import subprocess

    # Get RNAMEs in order of descending length
    sorted_rnames = [reference_index.string_to_rname['%012d' % i]
                        for i in xrange(
                                    len(reference_index.string_to_rname) - 1
                                )]
    (output_path, output_filename, output_stream, output_url,
        last_rname, last_sample_label) = [None]*6
    total_count, unique_count = 0, 0
    if args.out is not None:
        output_url = Url(args.out)
        if output_url.is_local:
            # Set up destination directory
            try: os.makedirs(output_url.to_url())
            except: pass
        else:
            mover = filemover.FileMover(args=args)
            # Set up temporary destination
            import tempfile
            temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
            register_cleanup(tempdel.remove_temporary_directories,
                                [temp_dir_path])
    move_temporary_file = False # True when temporary file should be uploaded
    while True:
        line = sys.stdin.readline()
        if not line:
Example #13
for input_line_count, line in enumerate(sys.stdin):
    # Kill offset from start of manifest file
    tokens = line.strip().split('\t')[1:]
    try:
        stripped = tokens[0].strip()
        if stripped[0] == '#' or not line.strip():
            continue
    except IndexError:
        continue
    token_count = len(tokens)
    assert token_count in [
        3, 5
    ], ('Line {} of input has {} fields, but 3 or 5 are expected.').format(
        input_line_count + 1, token_count)
    file_to_count = tokens[0]
    if (not ((token_count == 3 and Url(tokens[0]).is_local) or
             (token_count == 5 and Url(tokens[0]).is_local
              and Url(tokens[2]).is_local))):
        sys.stdout.write(line)
        output_line_count += 1
        continue
    with xopen(None, file_to_count) as input_stream:
        first_char = input_stream.readline()[0]
        if first_char in fastq_cues:
            # 4 lines per record
            line_divider = 4
        elif first_char in fasta_cues:
            line_divider = 2
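            # Assumes each FASTA record spans exactly two lines: a header
            # plus one unwrapped sequence line.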
        else:
            raise RuntimeError(
                'File "{}" is neither a FASTA nor a FASTQ file.'.format(