Example #1
    counter.add('bowtie_build_processes')
    bowtie_build_process = subprocess.Popen(
        [args.bowtie2_build_exe, fasta_file, index_basename],
        stderr=sys.stderr,
        stdout=sys.stderr)
    bowtie_build_process.wait()
    if bowtie_build_process.returncode:
        raise RuntimeError(
            'Bowtie index construction failed w/ exitlevel %d.' %
            bowtie_build_process.returncode)

# Compress index files
print >> sys.stderr, 'Compressing isofrag index...'
junction_index_filename = args.basename + '.tar.gz'
junction_index_path = os.path.join(temp_dir_path, junction_index_filename)
index_path = os.path.join(temp_dir_path, 'index')
tar = tarfile.TarFile.gzopen(junction_index_path, mode='w', compresslevel=3)
for index_file in os.listdir(index_path):
    tar.add(os.path.join(index_path, index_file), arcname=index_file)
tar.close()
counter.add('junction_index_archive_bytes',
            os.path.getsize(junction_index_path))
# Upload compressed index
print >> sys.stderr, 'Uploading or copying compressed index...'
counter.add('files_moved')
mover = filemover.FileMover(args=args)
mover.put(junction_index_path, output_url.plus(junction_index_filename))

print >>sys.stderr, 'DONE with junction_index.py; in=%d; time=%0.3f s' \
                        % (input_line_count, time.time() - start_time)
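
A side note on the archive step above: TarFile.gzopen is an internal classmethod of the tarfile module. The documented entry point producing the same gzipped tar is tarfile.open with a 'w:gz' mode string. A minimal equivalent sketch, reusing the index_path and junction_index_path names from the example; this is an alternative spelling, not what rail itself calls.

import os
import tarfile

# Same archive as tarfile.TarFile.gzopen(..., mode='w', compresslevel=3),
# but through the documented tarfile.open() interface.
with tarfile.open(junction_index_path, mode='w:gz', compresslevel=3) as tar:
    for index_file in os.listdir(index_path):
        tar.add(os.path.join(index_path, index_file), arcname=index_file)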
Example #2
def go(nucleotides_per_input=8000000, gzip_output=True, gzip_level=3,
        to_stdout=False, push='.', mover=filemover.FileMover(),
        verbose=False, scratch=None, bin_qualities=True, short_qnames=False,
        skip_bad_records=False, workspace_dir=None,
        fastq_dump_exe='fastq-dump', ignore_missing_sra_samples=False):
    """ Runs Rail-RNA-preprocess

        Input (read from stdin)
        ----------------------------
        Tab-separated fields:
        ---If URL is local:
        1. #!splitload
        2. \x1d-separated list of 0-based indexes of reads at which to start
            each new file
        3. \x1d-separated list of numbers of reads to include in gzipped files
        4. \x1d-separated list of manifest lines whose tabs are replaced by
            \x1es

        ---Otherwise:
        manifest line

        A manifest line has the following format

        (for single-end reads)
        <URL>(tab)<Optional MD5>(tab)<Sample label>

        (for paired-end reads)
        <URL 1>(tab)<Optional MD5 1>(tab)<URL 2>(tab)<Optional MD5 2>(tab)
        <Sample label>

        Hadoop output (written to stdout)
        ----------------------------
        None.

        Other output (written to directory specified by command-line parameter
            --push)
        ____________________________
        Files containing input data in one of the following formats:

        Format 1 (single-end, 4-column):
          1. Nucleotide sequence or its reversed complement, whichever is first
            in alphabetical order
          2. 1 if sequence was reverse-complemented else 0
          3. Name
          4. Quality sequence or its reverse, whichever corresponds to field 1

        Format 2 (paired, 2 lines, 4 columns each)
        (each line has the same columns as the single-end format)
          1. Nucleotide sequence for mate 1 or its reversed complement,
            whichever is first in alphabetical order
          2. 1 if sequence was reverse-complemented else 0
          3. Name for mate 1
          4. Quality sequence for mate 1 or its reverse, whichever corresponds
            to field 1
            
            (new line)

          1. Nucleotide sequence for mate 2 or its reversed complement,
            whichever is first in alphabetical order
          2. 1 if sequence was reverse complemented else 0
          3. Name for mate 2
          4. Quality sequence for mate 2 or its reverse, whichever corresponds
            to field 1

        Quality sequences are strings of hs for FASTA input.

        nucleotides_per_input: maximum number of nucleotides to put in a given
            input file
        gzip_output: True iff preprocessed input should be gzipped
        gzip_level: level of gzip compression to use
        to_stdout: True iff output should be written to stdout rather than
            to files
        push: where to send output
        mover: FileMover object used to download inputs and push output
        verbose: True iff extra debugging statements should be printed to
            stderr
        scratch: scratch directory for storing temporary files, or None if a
            securely created temporary directory should be used
        bin_qualities: True iff quality string should be binned according to
            rules in _mismatch_penalties_to_quality_scores
            and round_quality_string() defined in go()
        short_qnames: True iff original qname should be killed and a new qname
            should be written in a short base64-encoded format
        skip_bad_records: True iff bad records should be skipped; otherwise,
            raises exception if bad record is encountered
        workspace_dir: directory in which to run fastq-dump -- needed for
            working with dbGaP data; None if the temporary directory should
            be used
        fastq_dump_exe: path to fastq-dump executable
        ignore_missing_sra_samples: True iff no error should be raised when
            fastq-dump can't find a sample

        No return value
    """
    if bin_qualities:
        import math
        def round_quality_string(qual):
            """ Bins phred+33 quality string to improve compression.

                Uses 5-bin scheme that does not affect Bowtie 2 alignments

                qual: quality string

                Return value: "binned" quality string.
            """
            return ''.join(
                [str(int(
                    _MN + math.floor((_MX - _MN) * min(
                                                    ord(qual_char) - 33.0, 40.0
                                                ) / 40.0)
                        )) for qual_char in qual]).translate(
                                _mismatch_penalties_to_quality_scores
                            )
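        # Worked example (hypothetical bin values; assumes _MN = 0, _MX = 5
        # and an identity translation table, neither of which is shown here):
        #   'I' (phred+33 40) -> int(0 + floor(5 * 40/40.0)) = 5
        #   '5' (phred+33 20) -> int(0 + floor(5 * 20/40.0)) = 2
        #   '#' (phred+33 2)  -> int(0 + floor(5 * 2/40.0))  = 0
        # so round_quality_string('I5#') would yield '520' before translate()
        # maps each bin label to its output quality character.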
    else:
        def round_quality_string(qual):
            """ Leaves quality string unbinned and untouched.

                qual: quality string

                Return value: qual
            """
            return qual
    global _input_line_count, _output_line_count
    skip_stubs = False
    temp_dir = make_temp_dir(scratch)
    print >>sys.stderr, 'Created local destination directory "%s".' % temp_dir
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir])
    input_line_count, output_line_count = 0, 0
    if not to_stdout:
        push_url = Url(push)
        if push_url.is_local:
            destination = push
        elif push_url.is_s3 or push_url.is_hdfs or push_url.is_nfs:
            destination = temp_dir
        else:
            raise RuntimeError('Push destination must be '
                               'on S3, HDFS, NFS, or local.')
    fastq_cues = set(['@'])
    fasta_cues = set(['>', ';'])
    source_dict = {}
    onward = False
    for line in sys.stdin:
        _input_line_count += 1
        if not line.strip(): continue
        # Kill offset from start of manifest file
        try:
            tokens = line.strip().split('\t')[1:]
            if tokens[0][0] == '#' and tokens[0] != '#!splitload':
                # Comment line
                continue
        except IndexError:
            # Be robust to bad lines
            continue
        token_count = len(tokens)
        qual_getter = None
        if tokens[0] == '#!splitload':
            '''Line specifies precisely how records from files should be
            placed.'''
            assert not to_stdout, ('Split manifest line inconsistent with '
                                   'writing to stdout.')
            qual_getter = phred_converter(phred_format=tokens[-1])
            indexes = tokens[1].split('\x1d')
            read_counts = tokens[2].split('\x1d')
            manifest_lines = [token.split('\x1e')
                                for token in tokens[3].split('\x1d')]
            assert len(indexes) == len(read_counts) == len(manifest_lines)
            for i, manifest_line in enumerate(manifest_lines):
                manifest_line_field_count = len(manifest_line)
                if manifest_line_field_count == 3:
                    source_dict[(Url(manifest_line[0]),)] = (
                            manifest_line[-1],
                            int(indexes[i]),
                            int(read_counts[i])
                        )
                else:
                    assert manifest_line_field_count == 5
                    source_dict[(Url(manifest_line[0]),
                                 Url(manifest_line[2]))] = (
                                                        manifest_line[-1],
                                                        int(indexes[i]),
                                                        int(read_counts[i])
                                                    )
        elif token_count == 3:
            # SRA or single-end reads
            source_dict[(Url(tokens[0]),)] = (tokens[-1],)
        elif token_count == 5:
            # Paired-end reads
            source_dict[(Url(tokens[0]), Url(tokens[2]))] = (tokens[-1],)
        else:
            # Not a valid line, but continue for robustness
            continue
    file_number = 0
    for source_urls in source_dict:
        sample_label = source_dict[source_urls][0]
        downloaded = set()
        sources = []
        records_printed = 0
        if len(source_dict[source_urls]) == 3:
            skip_count = source_dict[source_urls][1]
            if len(source_urls) == 2:
                records_to_consume = source_dict[source_urls][2]
                if skip_count % 2:
                    skip_count -= 1
                    records_to_consume += 1
                if records_to_consume % 2:
                    records_to_consume -= 1
                # Index reads according to order in input to shorten read names
                read_index = skip_count / 2 # Index reads in pairs
            else:
                records_to_consume = source_dict[source_urls][2]
                read_index = skip_count
        else:
            skip_count = 0
            records_to_consume = None # Consume all records
            read_index = 0
        assert (records_to_consume is None or records_to_consume >= 0), (
                'Negative value %d of records to consume encountered.'
            ) % records_to_consume
        if records_to_consume == 0: continue
        skipped = False
        for source_url in source_urls:
            if not source_url.is_local:
                # Download
                print >>sys.stderr, 'Retrieving URL "%s"...' \
                    % source_url.to_url()
                if source_url.is_dbgap:
                    download_dir = workspace_dir
                elif source_url.is_sra:
                    download_dir = temp_dir
                if source_url.is_sra:
                    sra_accession = source_url.to_url()
                    fastq_dump_command = (
                            'set -exo pipefail; cd {download_dir}; '
                            '{fastq_dump_exe} -I -X 10000 --split-files '
                            '{sra_accession}'
                        ).format(download_dir=download_dir,
                                    fastq_dump_exe=fastq_dump_exe,
                                    sra_accession=sra_accession)
                    try:
                        subprocess.check_call(
                            fastq_dump_command, shell=True, 
                            executable='/bin/bash',
                            stdout=sys.stderr
                        )
                    except subprocess.CalledProcessError as e:
                        if e.returncode == 3 and ignore_missing_sra_samples:
                            onward = True
                            break
                        else:
                            raise RuntimeError(
                                ('Error "%s" encountered executing '
                                 'command "%s".') % (e.output,
                                                        fastq_dump_command))
                    import glob
                    sra_fastq_files = sorted(
                                        glob.glob(os.path.join(download_dir,
                                            '%s[_.]*' % sra_accession))
                                        ) # ensure 1 before 2 if paired-end
                    # Schedule for deletion
                    def silent_remove(filename):
                        try:
                            os.remove(filename)
                        except OSError:
                            pass
                    for sra_fastq_file in sra_fastq_files:
                        register_cleanup(silent_remove, sra_fastq_file)
                    sra_file_count = len(sra_fastq_files)
                    check_for_paired = False
                    if sra_file_count == 1:
                        sra_paired_end = False
                        print >>sys.stderr, 'Detected single-end SRA sample.'
                    elif sra_file_count in [2, 3]:
                        print >>sys.stderr, ('2 or 3 FASTQ files detected. '
                                             'Checking for barcodes...')
                        check_for_paired = True
                    else:
                        raise RuntimeError(
                                ('Unexpected number of files "%d" output '
                                 'by fastq-dump command "%s".')
                                    % (sra_file_count, fastq_dump_command)
                            )
                    if check_for_paired:
                        # Get max/min read lengths from FASTQ
                        with open(
                                    sra_fastq_files[sra_file_count - 2]
                                ) as fastq_stream:
                            max_len, min_len = (
                                    max_min_read_lengths_from_fastq_stream(
                                        fastq_stream
                                    )
                                )
                            print >>sys.stderr, (
                                    'Max/min read length found in candidate '
                                    'barcode FASTQ was {}/{}.'
                                ).format(max_len, min_len)
                            if max_len <= _max_stubby_read_length:
                                print >>sys.stderr, (
                                        'Assumed barcode FASTQ.'
                                    )
                                skip_stubs = True
                                if sra_file_count == 2:
                                    sra_paired_end = False
                                else:
                                    sra_paired_end = True
                            else:
                                if sra_file_count == 2:
                                    sra_paired_end = True
                                else:
                                    raise RuntimeError(
                                        '3 FASTQs detected, but one of them '
                                        'was not recognized as containing '
                                        'barcodes.'
                                    )
                    # Guess quality from first 10k lines
                    with xopen(None, sra_fastq_files[0]) as source_stream:
                        qual_getter = phred_converter(
                                            fastq_stream=source_stream
                                        )
                    for sra_fastq_file in sra_fastq_files:
                        os.remove(sra_fastq_file)
                    sources.append(os.devnull)
                    fastq_dump_command = (
                            'set -exo pipefail; cd {download_dir}; '
                            '{fastq_dump_exe} --split-spot -I --stdout '
                            '{sra_accession}'
                        ).format(download_dir=download_dir,
                                    fastq_dump_exe=fastq_dump_exe,
                                    sra_accession=sra_accession)
                    if skip_stubs:
                        fastq_dump_command += (
                                ' | awk \'BEGIN {{OFS = "\\n"}} '
                                '{{header = $0; '
                                'getline seq; getline qheader; getline qseq; '
                                'if (length(seq) > {min_len}) {{print header, '
                                'seq, qheader, qseq}}}}\''
                            ).format(min_len=_max_stubby_read_length)
                    print >>sys.stderr, fastq_dump_command
                    sra_process = subprocess.Popen(fastq_dump_command,
                                                    shell=True,
                                                    executable='/bin/bash',
                                                    stdout=subprocess.PIPE,
                                                    bufsize=-1)
                else:
                    mover.get(source_url, temp_dir)
                    downloaded = list(
                            set(os.listdir(temp_dir)).difference(downloaded)
                        )
                    sources.append(os.path.join(temp_dir, list(downloaded)[0]))
            else:
                sources.append(source_url.to_url())
        if onward: continue
        '''Use os.devnull so single- and paired-end data can be handled in one
        loop.'''
        if len(sources) == 1:
            sources.append(os.devnull)
        if qual_getter is None:
            # Figure out Phred format
            with xopen(None, sources[0]) as source_stream:
                qual_getter = phred_converter(fastq_stream=source_stream)
        with xopen(None, sources[0]) as source_stream_1, xopen(
                None, sources[1]
            ) as source_stream_2:
            source_streams = [source_stream_1, source_stream_2]
            reorganize = all([source == os.devnull for source in sources])
            if reorganize:
                # SRA data is live
                if sra_paired_end:
                    source_streams = [sra_process.stdout, sra_process.stdout]
                else:
                    source_streams = [sra_process.stdout, open(os.devnull)]
            break_outer_loop = False
            while True:
                if not to_stdout:
                    '''Name files using Hadoop task environment property
                    mapred.task.partition.'''
                    if gzip_output:
                        try:
                            output_file = os.path.join(
                                    destination, 
                                    '.'.join([
                                        os.environ['mapred_task_partition'],
                                        str(file_number), 'gz'
                                    ])
                                )
                        except KeyError:
                            '''Hadoop 2.x: mapreduce.task.partition; see 
                            http://hadoop.apache.org/docs/r2.0.3-alpha/
                            hadoop-project-dist/hadoop-common/
                            DeprecatedProperties.html.'''
                            output_file = os.path.join(
                                    destination, 
                                    '.'.join([
                                        os.environ['mapreduce_task_partition'],
                                        str(file_number), 'gz'
                                    ])
                                )
                        open_args = [output_file, 'a', gzip_level]
                    else:
                        try:
                            output_file = os.path.join(
                                    destination, 
                                    '.'.join([
                                        os.environ['mapred_task_partition'],
                                        str(file_number)
                                    ])
                                )
                        except KeyError:
                            output_file = os.path.join(
                                    destination, 
                                    '.'.join([
                                        os.environ['mapreduce_task_partition'],
                                        str(file_number)
                                    ])
                                )
                        open_args = [output_file, 'a']
                    try:
                        os.makedirs(os.path.dirname(output_file))
                    except OSError:
                        pass
                else:
                    open_args = []
                '''Use xopen to handle compressed streams and normal streams
                generally.'''
                with xopen(gzip_output if not to_stdout else '-', *open_args) \
                    as output_stream:
                    perform_push = False
                    line_numbers = [0, 0]
                    read_next_line = True
                    nucs_read = 0
                    pairs_read = 0
                    while True:
                        if read_next_line:
                            # Read next line only if FASTA mode didn't already
                            lines = []
                            for source_stream in source_streams:
                                lines.append(source_stream.readline())
                        read_next_line = True
                        if not lines[0]:
                            break_outer_loop = True
                            break
                        line_numbers = [i + 1 for i in line_numbers]
                        lines = [line.strip() for line in lines]
                        bad_record_skip = False
                        if lines[0][0] in fastq_cues:
                            if records_to_consume and not skipped:
                                '''Skip lines as necessary; for paired-end
                                reads skip the largest even number of records 
                                less than records_to_consume.'''
                                if len(source_urls) == 1:
                                    # single-end
                                    line_skip_count = max(
                                            skip_count * 4 - 1, 0
                                        )
                                else:
                                    # paired-end
                                    line_skip_count = max(
                                            ((skip_count / 2) * 4 - 1), 0
                                        )
                                    for _ in xrange(line_skip_count):
                                        next(source_stream_2)
                                for _ in xrange(line_skip_count):
                                    next(source_stream_1)
                                if skip_count:
                                    lines = []
                                    for source_stream in source_streams:
                                        lines.append(source_stream.readline())
                                    if not lines[0]:
                                        break_outer_loop = True
                                        break
                                    lines = [line.strip() for line in lines]
                                skipped = True
                            seqs = [source_stream.readline().strip()
                                        for source_stream in source_streams]
                            line_numbers = [i + 1 for i in line_numbers]
                            plus_lines = [source_stream.readline().strip()
                                            for source_stream
                                            in source_streams]
                            line_numbers = [i + 1 for i in line_numbers]
                            quals = [source_stream.readline().strip()
                                        for source_stream in source_streams]
                            if reorganize and sra_paired_end:
                                # Fix order!
                                lines, seqs, plus_lines, quals = (
                                        [lines[0], plus_lines[0]],
                                        [lines[1], plus_lines[1]],
                                        [seqs[0], quals[0]],
                                        [seqs[1], quals[1]]
                                    )
                            try:
                                assert plus_lines[0][0] == '+', (
                                        'Malformed read "%s" at line %d of '
                                        'file "%s".'
                                    ) % (lines[0], line_numbers[0], sources[0])
                                if plus_lines[1]:
                                    assert plus_lines[1][0] == '+', (
                                            'Malformed read "%s" at line %d '
                                            'of file "%s".'
                                        ) % (
                                        lines[1], line_numbers[1], sources[1]
                                    )
                                try:
                                    # Kill spaces in name
                                    original_qnames = \
                                        [line[1:].replace(' ', '_')
                                            for line in lines]
                                except IndexError:
                                    raise RuntimeError(
                                            'Error finding QNAME at '
                                            'line %d of either %s or %s' % (
                                                        line_numbers[0],
                                                        sources[0],
                                                        sources[1]
                                                    )
                                        )
                            except (AssertionError,
                                    IndexError, RuntimeError) as e:
                                if skip_bad_records:
                                    print >>sys.stderr, ('Error "%s" '
                                            'encountered; skipping bad record.'
                                        ) % e.message
                                    for source_stream in source_streams:
                                        source_stream.readline()
                                    line_numbers = [
                                            i + 1 for i in line_numbers
                                        ]
                                    bad_record_skip = True
                                else:
                                    raise
                            else:
                                try:
                                    quals = [
                                            qual_getter(qual) for qual in quals
                                        ]
                                except Exception as e:
                                    if skip_bad_records:
                                        print >>sys.stderr, (
                                                'Error "%s" encountered '
                                                'trying to convert quality '
                                                'string to Sanger format; '
                                                'skipping bad record.'
                                            ) % e.message
                                        bad_record_skip = True
                                    else:
                                        raise
                                line_numbers = [i + 1 for i in line_numbers]
                                try: 
                                    for i in xrange(2):
                                        assert len(seqs[i]) == len(quals[i]), (
                                            'Length of read sequence does not '
                                            'match length of quality string '
                                            'at line %d of file "%s".'
                                        ) % (line_numbers[i], sources[i])
                                except (AssertionError, IndexError) as e:
                                    if skip_bad_records:
                                        print >>sys.stderr, (
                                                'Error "%s" encountered; '
                                                'skipping bad record.'
                                            ) % e.message
                                        bad_record_skip = True
                                    else:
                                        raise
                        elif lines[0][0] in fasta_cues:
                            seqs = [[], []]
                            next_lines = []
                            for p, source_stream in enumerate(source_streams):
                                while True:
                                    next_line \
                                        = source_stream.readline().strip()
                                    try:
                                        if next_line[0] in fasta_cues:
                                            break
                                        else:
                                            seqs[p].append(next_line)
                                    except IndexError:
                                        break
                                next_lines.append(next_line)
                            seqs = [''.join(seq) for seq in seqs]
                            line_numbers = [i + 1 for i in line_numbers]
                            try:
                                try:
                                    # Kill spaces in name
                                    original_qnames = \
                                        [line[1:].replace(' ', '_')
                                            for line in lines]
                                except IndexError:
                                    raise RuntimeError(
                                            'Error finding QNAME at '
                                            'line %d of either %s or %s' % (
                                                        line_numbers[0],
                                                        sources[0],
                                                        sources[1]
                                                    )
                                        )
                            except (AssertionError,
                                    IndexError, RuntimeError) as e:
                                if skip_bad_records:
                                    print >>sys.stderr, ('Error "%s" '
                                            'encountered; skipping bad record.'
                                        ) % e.message
                                    for source_stream in source_streams:
                                        source_stream.readline()
                                    line_numbers = [
                                            i + 1 for i in line_numbers
                                        ]
                                    bad_record_skip = True
                                else:
                                    raise
                            else:
                                try:
                                    quals = [
                                        'h'*len(seq) for seq in seqs
                                        ]
                                except Exception as e:
                                    if skip_bad_records:
                                        print >>sys.stderr, (
                                                'Error "%s" encountered '
                                                'trying to convert quality '
                                                'string to Sanger format; '
                                                'skipping bad record.'
                                            ) % e.message
                                        bad_record_skip = True
                                    else:
                                        raise
                                line_numbers = [i + 1 for i in line_numbers]
                            lines = next_lines
                            read_next_line = False
                        if bad_record_skip:
                            seqs = []
                            # Fake record-printing to get to records_to_consume
                            if source_streams[-1].name == os.devnull:
                                records_printed += 1
                            else:
                                records_printed += 2
                        elif len(original_qnames) == 2 and original_qnames[1]:
                            # Paired-end write
                            if original_qnames[0] == original_qnames[1]:
                                # Add paired-end identifiers
                                original_qnames[0] += '/1'
                                original_qnames[1] += '/2'
                            assert seqs[1]
                            assert quals[1]
                            seqs = [seq.upper() for seq in seqs]
                            reversed_complement_seqs = [
                                    seqs[0][::-1].translate(
                                        _reversed_complement_translation_table
                                    ),
                                    seqs[1][::-1].translate(
                                        _reversed_complement_translation_table
                                    )
                                ]
                            if seqs[0] < reversed_complement_seqs[0]:
                                left_seq = seqs[0]
                                left_qual = quals[0]
                                left_reversed = '0'
                            else:
                                left_seq = reversed_complement_seqs[0]
                                left_qual = quals[0][::-1]
                                left_reversed = '1'
                            if seqs[1] < reversed_complement_seqs[1]:
                                right_seq = seqs[1]
                                right_qual = quals[1]
                                right_reversed = '0'
                            else:
                                right_seq = reversed_complement_seqs[1]
                                right_qual = quals[1][::-1]
                                right_reversed = '1'
                            if short_qnames:
                                left_qname_to_write = encode(read_index) + '/1'
                                right_qname_to_write = encode(
                                                            read_index
                                                        ) + '/2'
                            else:
                                left_qname_to_write = original_qnames[0]
                                right_qname_to_write = original_qnames[1]
                            print >>output_stream, '\t'.join(
                                        [
                                            left_seq,
                                            left_reversed,
                                            qname_from_read(
                                                    left_qname_to_write,
                                                    seqs[0] + quals[0], 
                                                    sample_label,
                                                    mate=seqs[1]
                                                ),
                                            '\n'.join([
                                                round_quality_string(
                                                    left_qual
                                                ), right_seq
                                            ]),
                                            right_reversed,
                                            qname_from_read(
                                                    right_qname_to_write,
                                                    seqs[1] + quals[1], 
                                                    sample_label,
                                                    mate=seqs[0]
                                                ),
                                            round_quality_string(right_qual)
                                        ]
                                    )
                            records_printed += 2
                            _output_line_count += 1
                        else:
                            seqs[0] = seqs[0].upper()
                            reversed_complement_seqs = [
                                    seqs[0][::-1].translate(
                                        _reversed_complement_translation_table
                                    )
                                ]
                            # Single-end write
                            if seqs[0] < reversed_complement_seqs[0]:
                                seq = seqs[0]
                                qual = quals[0]
                                is_reversed = '0'
                            else:
                                seq = reversed_complement_seqs[0]
                                qual = quals[0][::-1]
                                is_reversed = '1'
                            if short_qnames:
                                qname_to_write = encode(read_index)
                            else:
                                qname_to_write = original_qnames[0]
                            print >>output_stream, '\t'.join(
                                        [
                                            seq,
                                            is_reversed,
                                            qname_from_read(
                                                qname_to_write,
                                                seqs[0] + quals[0], 
                                                sample_label
                                            ),
                                            round_quality_string(qual)
                                        ]
                                    )
                            records_printed += 1
                            _output_line_count += 1
                        read_index += 1
                        for seq in seqs:
                            nucs_read += len(seq)
                        if records_printed == records_to_consume:
                            break_outer_loop = True
                            perform_push = True
                            break
                        if not to_stdout and not records_to_consume and \
                            nucs_read > nucleotides_per_input:
                            file_number += 1
                            break
                if verbose:
                    print >>sys.stderr, (
                            'Exited with statement; line numbers are %s' 
                            % line_numbers
                        )
                if (not to_stdout) and (push_url.is_nfs or
                    push_url.is_s3 or push_url.is_hdfs) \
                    and ((not records_to_consume) or
                         (records_to_consume and perform_push)):
                    print >>sys.stderr, 'Pushing "%s" to "%s" ...' % (
                                                            output_file,
                                                            push_url.to_url()
                                                        )
                    print >>sys.stderr, 'reporter:status:alive'
                    mover.put(output_file, push_url.plus(os.path.basename(
                                                                output_file
                                                            )))
                    try:
                        os.remove(output_file)
                    except OSError:
                        pass
                if break_outer_loop: break
            if verbose:
                print >>sys.stderr, 'Exiting source streams...'
        if verbose:
            print >>sys.stderr, 'Exited source streams.'
        # Clear temporary directory
        for input_file in os.listdir(temp_dir):
            try:
                os.remove(os.path.join(temp_dir, input_file))
            except OSError:
                pass
        if 'sra_process' in locals():
            sra_process.stdout.close()
            sra_return_code = sra_process.wait()
            if sra_return_code > 0:
                raise RuntimeError(('fastq-dump terminated with exit '
                                    'code %d. Command run was "%s".')
                                        % (sra_return_code,
                                            fastq_dump_command))
            del sra_process
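
The '#!splitload' wire format described in the docstring of go() above packs three \x1d-separated lists into one tab-separated line: 0-based start indexes, read counts, and manifest lines whose tabs have been swapped for \x1es, followed by a trailing Phred-format token. A minimal standalone decoder under exactly those assumptions; the function name and return shape here are illustrative, not rail's API.

def parse_splitload_line(line):
    """ Decodes one '#!splitload' line per the docstring of go().

        line: tab-separated input line, including the leading manifest-file
            offset field that go() strips.

        Return value: (list of (manifest fields, start index, read count)
            tuples, Phred format token)
    """
    tokens = line.strip().split('\t')[1:]  # drop offset field
    assert tokens[0] == '#!splitload'
    indexes = tokens[1].split('\x1d')
    read_counts = tokens[2].split('\x1d')
    manifest_lines = [token.split('\x1e')
                        for token in tokens[3].split('\x1d')]
    return ([(fields, int(start), int(count))
                for fields, start, count
                in zip(manifest_lines, indexes, read_counts)], tokens[-1])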
Example #3
                                % bowtie_build_thread.bowtie_build_process)
else:
    bowtie_build_process = subprocess.Popen(
                                [args.bowtie2_build_exe,
                                    fasta_file,
                                    index_basename],
                                stderr=sys.stderr,
                                stdout=sys.stderr
                            )
    bowtie_build_process.wait()
    if bowtie_build_process.returncode:
        raise RuntimeError('Bowtie index construction failed w/ exitlevel %d.'
                                % bowtie_build_process.returncode)

# Compress index files
print >>sys.stderr, 'Compressing intron index...'
intron_index_filename = args.basename + '.tar.gz'
intron_index_path = os.path.join(temp_dir_path, intron_index_filename)
index_path = os.path.join(temp_dir_path, 'index')
tar = tarfile.TarFile.gzopen(intron_index_path, mode='w', compresslevel=3)
for index_file in os.listdir(index_path):
    tar.add(os.path.join(index_path, index_file), arcname=index_file)
tar.close()
# Upload compressed index
print >>sys.stderr, 'Uploading or copying compressed index...'
mover = filemover.FileMover(args=args)
mover.put(intron_index_path, output_url.plus(intron_index_filename))

print >>sys.stderr, 'DONE with intron_index.py; in=%d; time=%0.3f s' \
                        % (input_line_count, time.time() - start_time)
Example #4
        temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
        register_cleanup(tempdel.remove_temporary_directories,
                            [temp_dir_path])
        output_filename = args.junction_filename + '.temp'
        output_filename = os.path.join(temp_dir_path, output_filename)
    with xopen(True, output_filename, 'w', args.gzip_level) as output_stream:
        for line in sys.stdin:
            tokens = line.strip().split('\t')
            # Remove leading zeros from ints
            print >>output_stream, '\t'.join(
                    [tokens[0], str(int(tokens[1])),
                        str(int(tokens[2]) - 1), tokens[3], tokens[4]]
                )
            input_line_count += 1
else:
    # Default --out is stdout
    for line in sys.stdin:
        tokens = line.strip().split('\t')
        # Remove leading zeros from ints
        print '\t'.join([tokens[0], str(int(tokens[1])),
                                    str(int(tokens[2]) - 1), tokens[3],
                                    tokens[4]])
        input_line_count += 1

if args.out is not None and not output_url.is_local:
    mover = filemover.FileMover(args=args)
    mover.put(output_filename, output_url.plus(args.junction_filename))

print >>sys.stderr, 'DONE with junction_collect.py; in = %d; time=%0.3f s' \
                        % (input_line_count, time.time() - start_time)
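
Examples #4, #12, and #14 write through rail's xopen helper, whose first argument selects the stream type: '-' for stdout, True for gzip at a given compression level, False for a plain file. A rough stand-in illustrating just that calling convention; rail's real xopen does more (e.g., when the first argument is None it infers compression from the filename, as in the reads in Example #2).

import gzip
import sys

def xopen_sketch(gzipped, *args):
    """ Illustrative stand-in for rail's xopen; not its implementation.

        xopen_sketch('-')                       -> sys.stdout
        xopen_sketch(True, path, 'w', level)    -> gzipped stream
        xopen_sketch(False, path, 'w')          -> plain file object
    """
    if gzipped == '-':
        return sys.stdout
    if gzipped:
        path, mode, compresslevel = args
        return gzip.open(path, mode, compresslevel)
    return open(*args)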
Example #5
                                        sample_index, span))
            sample_index += span
            reads_assigned += span
            if not (reads_assigned % reads_per_file):
                lines_assigned.append([])
with open(output_path, 'w') as output_stream:
    for line_tuples in lines_assigned:
        if not line_tuples: continue
        print >>output_stream, '\t'.join(('#!splitload', '\x1d'.join(
                        str(line_tuple[-2]) for line_tuple in line_tuples
                    ), 
                    '\x1d'.join(
                        str(line_tuple[-1]) for line_tuple in line_tuples
                    ),
                    '\x1d'.join(
                            '\x1e'.join(samples[line_tuple[0]][1:])
                            for line_tuple in line_tuples
                        ),
                    phred_format
                    ))
    for line in saved:
        print >>output_stream, line.strip()
if not output_url.is_local:
    mover.put(output_path, output_url.plus(args.filename))
    os.remove(output_path)

sys.stdout.flush()
print >>sys.stderr, 'DONE with assign_splits.py; in/out=%d/%d; ' \
        'time=%0.3f s' % (input_line_count, output_line_count,
                            time.time() - start_time)
Example #6
File: bed.py Project: BenLangmead/rail
                                                ],
                                            start_position,
                                            end_position, i+1,
                                            maximin_overhang,
                                            coverage,
                                            reverse_strand_string,
                                            start_position, end_position,
                                            max_left_overhang,
                                            max_right_overhang,
                                            max_left_overhang + end_pos - pos
                                        )
            input_line_count += i
        else:
            counter.add('insertion_line' if line_type == 'I' else 'deletion_line')
            for i, (rname, pos, end_pos, seq, _, _, _, coverage) \
                in enumerate(xpartition):
                pos, end_pos = int(pos) - 1, int(end_pos) - 1
                print >>output_stream, '%s\t%d\t%d\t%s\t%s' \
                                        % (reference_index.string_to_rname[
                                                rname
                                            ], pos, end_pos, seq, coverage)
            input_line_count += i
    counter.flush()
    if not output_url.is_local:
        counter.add('files_uploaded')
        mover.put(output_path, output_url.plus(output_filename))
        os.remove(output_path)

print >>sys.stderr, 'DONE with bed.py; in=%d; time=%0.3f s' \
                        % (input_line_count, time.time() - start_time)
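
For reference, the insertion/deletion branch above emits five-column BED-like lines: reference name, 0-based start, 0-based end (both decremented from the 1-based input coordinates), the inserted or deleted sequence, and coverage. A worked line with hypothetical values standing in for the xpartition record (in the real loop, rname is first mapped through reference_index.string_to_rname):

# Hypothetical record mirroring the tuple unpacked in the loop above
rname, pos, end_pos, seq, coverage = 'chr1', '100', '103', 'ACG', '7'
pos, end_pos = int(pos) - 1, int(end_pos) - 1
print '%s\t%d\t%d\t%s\t%s' % (rname, pos, end_pos, seq, coverage)
# prints (tab-separated): chr1  99  102  ACG  7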
Example #7
File: bed.py Project: Honglongwu/rail
                                       '%d,%d\t0,%d' % (
                                            reference_index.string_to_rname[
                                                    rname
                                                ],
                                            start_position,
                                            end_position, i+1,
                                            maximin_overhang,
                                            coverage,
                                            reverse_strand_string,
                                            start_position, end_position,
                                            max_left_overhang,
                                            max_right_overhang,
                                            max_left_overhang + end_pos - pos
                                        )
            input_line_count += i
        else:
            for i, (rname, pos, end_pos, seq, _, _, _, coverage) \
                in enumerate(xpartition):
                pos, end_pos = int(pos) - 1, int(end_pos) - 1
                print >>output_stream, '%s\t%d\t%d\t%s\t%s' \
                                        % (reference_index.string_to_rname[
                                                rname
                                            ], pos, end_pos, seq, coverage)
            input_line_count += i
    if not output_url.is_local:
        mover.put(output_path, output_url.plus(output_filename))
        os.remove(output_path)

print >>sys.stderr, 'DONE with bed.py; in=%d; time=%0.3f s' \
                        % (input_line_count, time.time() - start_time)
Example #8
            if not span:
                continue
            lines_assigned[-1].append(
                (current_sample, samples[current_sample][-1], sample_index,
                 span))
            sample_index += span
            reads_assigned += span
            if not (reads_assigned % reads_per_file):
                lines_assigned.append([])
print >> sys.stderr, 'Finished crit block'
with open(output_path, 'w') as output_stream:
    for line_tuples in lines_assigned:
        if not line_tuples: continue
        print >> output_stream, '\t'.join(
            ('#!splitload',
             '\x1d'.join(str(line_tuple[-2]) for line_tuple in line_tuples),
             '\x1d'.join(str(line_tuple[-1]) for line_tuple in line_tuples),
             '\x1d'.join('\x1e'.join(samples[line_tuple[0]][1:])
                         for line_tuple in line_tuples), phred_format))
    for line in saved:
        print >> output_stream, line.strip()
if not output_url.is_local:
    print >> sys.stderr, 'Uploading {} to {}...'.format(output_path, args.out)
    mover.put(output_path, output_url.plus(args.filename))
    os.remove(output_path)

sys.stdout.flush()
print >>sys.stderr, 'DONE with assign_splits.py; in/out=%d/%d; ' \
        'time=%0.3f s' % (input_line_count, output_line_count,
                            time.time() - start_time)
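
The write shown in Examples #5 and #8 is the inverse of the '#!splitload' parsing in Example #2: tabs inside each manifest line become \x1es, manifest lines join with \x1ds, and the index and count lists travel in parallel fields. A tiny round trip with hypothetical sample values (the trailing Phred-format token is an assumption here):

# Hypothetical batch: (manifest fields, start index, read count) per sample
batch = [(['reads_1.fastq', '0', 'sample_A'], 0, 500),
         (['reads_2.fastq', '0', 'sample_B'], 500, 250)]
line = '\t'.join(('#!splitload',
                  '\x1d'.join(str(start) for _, start, _ in batch),
                  '\x1d'.join(str(count) for _, _, count in batch),
                  '\x1d'.join('\x1e'.join(fields) for fields, _, _ in batch),
                  'sanger'))
# Decoding recovers the same lists; cf. the parsing sketch after Example #2.
tokens = line.split('\t')
assert tokens[1].split('\x1d') == ['0', '500']
assert [t.split('\x1e') for t in tokens[3].split('\x1d')] \
    == [fields for fields, _, _ in batch]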
Example #9
        args.bigwig_exe, bed_filename, sizes_filename, bigwig_file_paths[0]
    ],
                       [
                           args.bigwig_exe, unique_bed_filename,
                           sizes_filename, bigwig_file_paths[1]
                       ]]
    for i, bigwig_command in enumerate(bigwig_commands):
        if args.verbose:
            print >>sys.stderr, 'Writing bigwig with command %s.' \
                % ' '.join(bigwig_command)
        bedtobigwig_process = subprocess.Popen(bigwig_command,
                                               stderr=sys.stderr,
                                               stdout=sys.stderr,
                                               bufsize=-1)
        bedtobigwig_process.wait()
        if bedtobigwig_process.returncode:
            raise RuntimeError('bedgraphtobigwig process failed w/ '
                               'exitlevel %d.' %
                               bedtobigwig_process.returncode)
        if args.verbose:
            print >> sys.stderr, ('bedTobigwig command %s succeeded.' %
                                  ' '.join(bigwig_command))
        if not output_url.is_local:
            # bigwig must be uploaded to URL and deleted
            mover.put(bigwig_file_paths[i],
                      output_url.plus(bigwig_filenames[i]))
            os.remove(bigwig_file_paths[i])

print >>sys.stderr, 'DONE with coverage.py; in/out=%d/%d; time=%0.3f s' \
                        % (input_line_count, output_line_count,
                            time.time() - start_time)
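
The Popen/wait/returncode pattern around bedGraphToBigWig above (and around bowtie2-build in the index examples) can be condensed with subprocess.check_call, which raises on any nonzero exit. A sketch under that substitution, reusing bigwig_command from the loop:

import subprocess
import sys

try:
    # check_call waits for the process and raises CalledProcessError
    # if the exit code is nonzero
    subprocess.check_call(bigwig_command, stderr=sys.stderr,
                          stdout=sys.stderr, bufsize=-1)
except subprocess.CalledProcessError as e:
    raise RuntimeError('bedgraphtobigwig process failed w/ exitlevel %d.'
                       % e.returncode)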
Example #10
                        bigwig_file_paths[1]]]
    for i, bigwig_command in enumerate(bigwig_commands):
        if args.verbose:
            print >>sys.stderr, 'Writing bigwig with command %s.' \
                % ' '.join(bigwig_command)
        counter.add('call_bedgraphtobigwig')
        bedtobigwig_process = subprocess.Popen(
                                    bigwig_command,
                                    stderr=sys.stderr,
                                    stdout=sys.stderr,
                                    bufsize=-1
                                )
        bedtobigwig_process.wait()
        if bedtobigwig_process.returncode:
            raise RuntimeError('bedgraphtobigwig process failed w/ '
                               'exitlevel %d.'
                                % bedtobigwig_process.returncode)
        if args.verbose:
            print >>sys.stderr, ('bedTobigwig command %s succeeded.'
                                  % ' '.join(bigwig_command))
        if not output_url.is_local:
            # bigwig must be uploaded to URL and deleted
            counter.add('files_moved')
            mover.put(bigwig_file_paths[i],
                        output_url.plus(bigwig_filenames[i]))
            os.remove(bigwig_file_paths[i])

print >>sys.stderr, 'DONE with coverage.py; in/out=%d/%d; time=%0.3f s' \
                        % (input_line_count, output_line_count,
                            time.time() - start_time)
Example #11
                                % bowtie_build_thread.bowtie_build_process)
else:
    bowtie_build_process = subprocess.Popen(
                                [args.bowtie2_build_exe,
                                    fasta_file,
                                    index_basename],
                                stderr=sys.stderr,
                                stdout=sys.stderr
                            )
    bowtie_build_process.wait()
    if bowtie_build_process.returncode:
        raise RuntimeError('Bowtie index construction failed w/ exitlevel %d.'
                                % bowtie_build_process.returncode)

# Compress index files
print >>sys.stderr, 'Compressing isofrag index...'
junction_index_filename = args.basename + '.tar.gz'
junction_index_path = os.path.join(temp_dir_path, junction_index_filename)
index_path = os.path.join(temp_dir_path, 'index')
tar = tarfile.TarFile.gzopen(junction_index_path, mode='w', compresslevel=3)
for index_file in os.listdir(index_path):
    tar.add(os.path.join(index_path, index_file), arcname=index_file)
tar.close()
# Upload compressed index
print >>sys.stderr, 'Uploading or copying compressed index...'
mover = filemover.FileMover(args=args)
mover.put(junction_index_path, output_url.plus(junction_index_filename))

print >>sys.stderr, 'DONE with junction_index.py; in=%d; time=%0.3f s' \
                        % (input_line_count, time.time() - start_time)
Example #12
                            [temp_dir_path])
        output_filename = args.junction_filename + '.temp'
        output_filename = os.path.join(temp_dir_path, output_filename)
    with xopen(True, output_filename, 'w', args.gzip_level) as output_stream:
        for line in sys.stdin:
            counter.add('inputs')
            tokens = line.strip().split('\t')
            # Remove leading zeros from ints
            print >>output_stream, '\t'.join(
                    [tokens[0][:-1], tokens[0][-1], str(int(tokens[1])),
                        str(int(tokens[2]) - 1), tokens[3], tokens[4]]
                )
            input_line_count += 1
else:
    # Default --out is stdout
    for line in sys.stdin:
        counter.add('inputs')
        tokens = line.strip().split('\t')
        # Remove leading zeros from ints
        print '\t'.join([tokens[0], str(int(tokens[1])),
                                    str(int(tokens[2]) - 1), tokens[3],
                                    tokens[4]])
        input_line_count += 1

if args.out is not None and not output_url.is_local:
    mover = filemover.FileMover(args=args)
    mover.put(output_filename, output_url.plus(args.junction_filename))

print >>sys.stderr, 'DONE with junction_collect.py; in = %d; time=%0.3f s' \
                        % (input_line_count, time.time() - start_time)
Example #13
    last_output_path = output_path
    move_temporary_file = True
else:
    tokens = line.rstrip().split('\t')
    sample_index, rname_index, pos, qname, flag = tokens[:5]
    if args.output_by_chromosome:
        (sample_index, rname_index) \
            = sample_and_rname_indexes.sample_and_rname_indexes(
                    sample_index
                )
    sample_label = manifest_object.index_to_label[sample_index]
    rname = reference_index.string_to_rname[rname_index]
if move_temporary_file and last_sample_label is not None \
    and not output_url.is_local:
    mover.put(last_output_path,
        output_url.plus(last_output_filename))
    os.remove(last_output_path)
    if not last_output_path.endswith('.unmapped.bam'):
        mover.put(
            ''.join([last_output_path, '.bai']),
            output_url.plus(''.join([last_output_filename, '.bai']))
        )
        os.remove(''.join([last_output_path, '.bai']))
    move_temporary_file = False
try:
    if (sample_label != last_sample_label or rname != last_rname
        or not line):
        print 'counts\t-\t%s\t%s\t%d\t%d' % (last_sample_index,
                                             last_rname_index,
                                             total_count, unique_count)
        total_count, unique_count = 0, 0
Example #14
    output_url = Url(args.out)
    if output_url.is_local:
        try: os.makedirs(output_url.to_url())
        except OSError: pass
        output_filename = os.path.join(args.out, args.intron_filename)
    else:
        temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
        register_cleanup(tempdel.remove_temporary_directories,
                            [temp_dir_path])
        output_filename = args.intron_filename + '.temp'
        output_filename = os.path.join(temp_dir_path, output_filename)
    with xopen(True, output_filename, 'w', args.gzip_level) as output_stream:
        for line in sys.stdin:
            output_stream.write(line)
            input_line_count += 1
else:
    # Default --out is stdout
    for line in sys.stdin:
        tokens = line.strip().split('\t')
        # Remove leading zeros from ints
        sys.stdout.write('\t'.join([tokens[0], str(int(tokens[1])),
                                    str(int(tokens[2])), tokens[3],
                                    tokens[4]]) + '\n')
        input_line_count += 1

if args.out is not None and not output_url.is_local:
    mover = filemover.FileMover(args=args)
    mover.put(output_filename, output_url.plus(args.intron_filename))

print >>sys.stderr, 'DONE with intron_collect.py; in = %d; time=%0.3f s' \
                        % (input_line_count, time.time() - start_time)
Example #15
        raise RuntimeError(
            'Bowtie index construction failed w/ exitlevel %d.' %
            bowtie_build_thread.bowtie_build_process)
else:
    bowtie_build_process = subprocess.Popen(
        [args.bowtie2_build_exe, fasta_file, index_basename],
        stderr=sys.stderr,
        stdout=sys.stderr)
    bowtie_build_process.wait()
    if bowtie_build_process.returncode:
        raise RuntimeError(
            'Bowtie index construction failed w/ exitlevel %d.' %
            bowtie_build_process.returncode)

# Compress index files
print >> sys.stderr, 'Compressing intron index...'
intron_index_filename = args.basename + '.tar.gz'
intron_index_path = os.path.join(temp_dir_path, intron_index_filename)
index_path = os.path.join(temp_dir_path, 'index')
tar = tarfile.TarFile.gzopen(intron_index_path, mode='w', compresslevel=3)
for index_file in os.listdir(index_path):
    tar.add(os.path.join(index_path, index_file), arcname=index_file)
tar.close()
# Upload compressed index
print >> sys.stderr, 'Uploading or copying compressed index...'
mover = filemover.FileMover(args=args)
mover.put(intron_index_path, output_url.plus(intron_index_filename))

print >>sys.stderr, 'DONE with intron_index.py; in=%d; time=%0.3f s' \
                        % (input_line_count, time.time() - start_time)