Exemplo n.º 1
0
def test_async_mapper_execution():
    """Map reads_1 without an output file and check the returned result.

    The result must carry a process, have no filename, and yield 10000
    entries when iterated.
    """
    reads = files.open(testfiles["reads_1.fastq"])
    result = gem.mapper(reads, index)

    assert result is not None
    assert result.process is not None
    assert result.filename is None

    # count the entries by draining the iterator
    n_entries = 0
    for _ in result:
        n_entries += 1
    assert n_entries == 10000
Exemplo n.º 2
0
def gem_mapping(gem_index_path, fastq_path, out_map_path, **kwargs):
    """
    Map a FASTQ file against a GEM index with ``gem.mapper``.

    The three paths are user-expanded (``~``) and made absolute before use.

    :param gem_index_path: path to the GEM index file
    :param fastq_path: path to the input FASTQ file
    :param out_map_path: path given to the mapper as its ``output``
    :param 8 nthreads: number of threads handed to the mapper
    :param 0.04 max_edit_distance: forwarded to the mapper unchanged
    :param 0.04 mismatches: forwarded to the mapper unchanged
    :param 33 quality: set it to 'ignore' in order to speed-up the mapping

    The keywords ``max_reads_per_chunk``, ``out_files`` and ``temp_dir`` are
    accepted without warning but are not used by this function. Any other
    keyword only triggers a warning; it is otherwise ignored.

    :returns: the object returned by ``gem.mapper``
    """
    gem_index_path    = os.path.abspath(os.path.expanduser(gem_index_path))
    fastq_path        = os.path.abspath(os.path.expanduser(fastq_path))
    out_map_path      = os.path.abspath(os.path.expanduser(out_map_path))
    nthreads          = kwargs.get('nthreads'            , 8)
    max_edit_distance = kwargs.get('max_edit_distance'   , 0.04)
    mismatches        = kwargs.get('mismatches'          , 0.04)
    quality           = kwargs.get('quality'             , 33)

    # check kwargs ('quality' is read above, so it must be accepted here too,
    # otherwise every caller setting it gets a spurious warning)
    known_kwargs = ('nthreads', 'max_edit_distance', 'mismatches', 'quality',
                    'max_reads_per_chunk', 'out_files', 'temp_dir')
    for kw in kwargs:
        if kw not in known_kwargs:
            warn('WARNING: %s not is usual keywords, misspelled?' % kw)

    # input
    inputf = gem.files.open(fastq_path)

    # mapping
    # single pre-formatted argument: prints the same text on Python 2 and 3
    print('TO GEM %s' % fastq_path)
    return gem.mapper(inputf, gem_index_path, min_decoded_strata=0,
                      max_decoded_matches=1, unique_mapping=False,
                      max_edit_distance=max_edit_distance,
                      mismatches=mismatches, quality=quality,
                      output=out_map_path,
                      threads=nthreads)
Exemplo n.º 3
0
def test_quality_pass_on_execution():
    """Map reads_1 into a file and check the result's quality is 'offset-33'."""
    reads = files.open(testfiles["reads_1.fastq"])
    map_path = results_dir + "/quality_passon_mapping.map"
    result = gem.mapper(reads, index, output=map_path)

    failure_hint = "Quality should be 'offset-33' but is %s"
    assert result.quality == "offset-33", failure_hint % (str(result.quality))
Exemplo n.º 4
0
def test_interleaved_pair_aligner_run():
    """Interleave both read files, map them, pair-align and count the output.

    The pair aligner result must exist and yield 20000 entries.
    """
    first_mate = files.open(testfiles["reads_1.fastq"])
    second_mate = files.open(testfiles["reads_2.fastq"])
    interleaved = filter.interleave([first_mate, second_mate])

    mapped = gem.mapper(interleaved, index)
    paired = gem.pairalign(mapped, index)
    assert paired is not None

    n_entries = 0
    for _ in paired:
        n_entries += 1
    assert n_entries == 20000  # test dataset does not pair at all
Exemplo n.º 5
0
def test_interleaved_pair_aligner_run():
    """Interleave both read files, map them, pair-align and count the output.

    The pair aligner result must exist and yield 20000 entries.
    """
    first_mate = files.open(testfiles["reads_1.fastq"])
    second_mate = files.open(testfiles["reads_2.fastq"])
    interleaved = filter.interleave([first_mate, second_mate])

    mapped = gem.mapper(interleaved, index)
    paired = gem.pairalign(mapped, index)
    assert paired is not None

    n_entries = 0
    for _ in paired:
        n_entries += 1
    assert n_entries == 20000  # test dataset does not pair at all
Exemplo n.º 6
0
def test_async_mapper_execution():
    """Map reads_1 without an output file and check the returned result.

    The result must carry a process, have no filename, and yield 10000
    entries when iterated.
    """
    reads = files.open(testfiles["reads_1.fastq"])
    result = gem.mapper(reads, index)

    assert result is not None
    assert result.process is not None
    assert result.filename is None

    # count the entries by draining the iterator
    n_entries = 0
    for _ in result:
        n_entries += 1
    assert n_entries == 10000
Exemplo n.º 7
0
def gem_mapping(gem_index_path, fastq_path, out_map_path, **kwargs):
    """
    Map a FASTQ file against a GEM index with ``gem.mapper``.

    The three paths are user-expanded (``~``) and made absolute before use.

    :param gem_index_path: path to the GEM index file
    :param fastq_path: path to the input FASTQ file
    :param out_map_path: path given to the mapper as its ``output``
    :param 8 nthreads: number of threads handed to the mapper
    :param 0.04 max_edit_distance: forwarded to the mapper unchanged
    :param 0.04 mismatches: forwarded to the mapper unchanged
    :param 33 quality: set it to 'ignore' in order to speed-up the mapping

    The keywords ``max_reads_per_chunk``, ``out_files`` and ``temp_dir`` are
    accepted without warning but are not used by this function. Any other
    keyword only triggers a warning; it is otherwise ignored.

    :returns: the object returned by ``gem.mapper``
    """
    gem_index_path    = os.path.abspath(os.path.expanduser(gem_index_path))
    fastq_path        = os.path.abspath(os.path.expanduser(fastq_path))
    out_map_path      = os.path.abspath(os.path.expanduser(out_map_path))
    nthreads          = kwargs.get('nthreads'            , 8)
    max_edit_distance = kwargs.get('max_edit_distance'   , 0.04)
    mismatches        = kwargs.get('mismatches'          , 0.04)
    quality           = kwargs.get('quality'             , 33)

    # check kwargs ('quality' is read above, so it must be accepted here too,
    # otherwise every caller setting it gets a spurious warning)
    known_kwargs = ('nthreads', 'max_edit_distance', 'mismatches', 'quality',
                    'max_reads_per_chunk', 'out_files', 'temp_dir')
    for kw in kwargs:
        if kw not in known_kwargs:
            warn('WARNING: %s not is usual keywords, misspelled?' % kw)

    # input
    inputf = gem.files.open(fastq_path)

    # mapping
    # single pre-formatted argument: prints the same text on Python 2 and 3
    print('TO GEM %s' % fastq_path)
    return gem.mapper(inputf, gem_index_path, min_decoded_strata=0,
                      max_decoded_matches=1, unique_mapping=False,
                      max_edit_distance=max_edit_distance,
                      mismatches=mismatches, quality=quality,
                      output=out_map_path,
                      threads=nthreads)
Exemplo n.º 8
0
def test_sync_mapper_execution():
    """Map reads_1 into a named output file and check the returned result.

    The result must carry a process, report the requested filename, and
    yield 10000 entries when iterated.
    """
    reads = files.open(testfiles["reads_1.fastq"])
    target = results_dir + "/result.mapping"
    result = gem.mapper(reads, index, target)

    assert result is not None
    assert result.process is not None
    assert result.filename is not None
    assert result.filename == results_dir + "/result.mapping"

    n_entries = 0
    for _ in result:
        n_entries += 1
    assert n_entries == 10000
Exemplo n.º 9
0
def test_sync_mapper_execution():
    """Map reads_1 into a named output file and check the returned result.

    The result must carry a process, report the requested filename, and
    yield 10000 entries when iterated.
    """
    reads = files.open(testfiles["reads_1.fastq"])
    target = results_dir + "/result.mapping"
    result = gem.mapper(reads, index, target)

    assert result is not None
    assert result.process is not None
    assert result.filename is not None
    assert result.filename == results_dir + "/result.mapping"

    n_entries = 0
    for _ in result:
        n_entries += 1
    assert n_entries == 10000
def test_gem2sam_execution_to_file():
    """Convert a mapping to SAM with an output path and check the file exists.

    The converter result must carry a process and report the requested
    filename.
    """
    reads = gem.files.open(testfiles["reads_1.fastq"])
    mapped = gem.mapper(reads, index)

    sam_path = results_dir + "/test_sam.sam"
    converted = gem.gem2sam(mapped, index, output=sam_path, compact=True)

    assert converted is not None
    assert converted.process is not None
    assert converted.filename == sam_path
    assert os.path.exists(sam_path)
Exemplo n.º 11
0
def test_interleaved_mapper_run():
    """Interleave both read files, map them and check the returned result.

    The result must carry a process, have no filename, and yield 20000
    entries when iterated.
    """
    first_mate = files.open(testfiles["reads_1.fastq"])
    second_mate = files.open(testfiles["reads_2.fastq"])
    interleaved = filter.interleave([first_mate, second_mate])

    result = gem.mapper(interleaved, index)
    assert result is not None
    assert result.process is not None
    assert result.filename is None

    n_entries = 0
    for _ in result:
        n_entries += 1
    assert n_entries == 20000
Exemplo n.º 12
0
def test_gem2sam_execution_to_file():
    """Convert a mapping to SAM with an output path and check the file exists.

    The converter result must carry a process and report the requested
    filename.
    """
    reads = gem.files.open(testfiles["reads_1.fastq"])
    mapped = gem.mapper(reads, index)

    sam_path = results_dir + "/test_sam.sam"
    converted = gem.gem2sam(mapped, index, output=sam_path, compact=True)

    assert converted is not None
    assert converted.process is not None
    assert converted.filename == sam_path
    assert os.path.exists(sam_path)
Exemplo n.º 13
0
def test_interleaved_mapper_run():
    """Interleave both read files, map them and check the returned result.

    The result must carry a process, have no filename, and yield 20000
    entries when iterated.
    """
    first_mate = files.open(testfiles["reads_1.fastq"])
    second_mate = files.open(testfiles["reads_2.fastq"])
    interleaved = filter.interleave([first_mate, second_mate])

    result = gem.mapper(interleaved, index)
    assert result is not None
    assert result.process is not None
    assert result.filename is None

    n_entries = 0
    for _ in result:
        n_entries += 1
    assert n_entries == 20000
Exemplo n.º 14
0
def test_gem2sam_sam2bam():
    """Chain mapper, gem2sam and sam2bam, then count the BAM file's entries.

    The BAM file must exist and yield 10000 entries when re-opened through
    ``gem.files.open``.
    """
    reads = gem.files.open(testfiles["reads_1.fastq"])
    mapped = gem.mapper(reads, index)
    converted = gem.gem2sam(mapped, index, compact=True)

    bam_path = results_dir + "/test_sam.bam"
    # keep a reference to the sam2bam result for the rest of the test
    bam_handle = gem.sam2bam(converted, output=bam_path)
    assert os.path.exists(bam_path)

    n_entries = sum(1 for _ in gem.files.open(bam_path))
    assert n_entries == 10000, "Count 10000!=%d" % n_entries
Exemplo n.º 15
0
def test_gem2sam_execution():
    """Convert a mapping to SAM without an output file and count the entries.

    The converter result must carry a process, have no filename, and yield
    10000 entries when iterated.
    """
    reads = files.open(testfiles["reads_1.fastq"])
    mapped = gem.mapper(reads, index)
    converted = gem.gem2sam(mapped, index, compact=True)

    assert converted is not None
    assert converted.process is not None
    assert converted.filename is None

    n_entries = sum(1 for _ in converted)
    assert n_entries == 10000
Exemplo n.º 16
0
def test_gem2sam_execution():
    """Convert a mapping to SAM without an output file and count the entries.

    The converter result must carry a process, have no filename, and yield
    10000 entries when iterated.
    """
    reads = files.open(testfiles["reads_1.fastq"])
    mapped = gem.mapper(reads, index)
    converted = gem.gem2sam(mapped, index, compact=True)

    assert converted is not None
    assert converted.process is not None
    assert converted.filename is None

    n_entries = sum(1 for _ in converted)
    assert n_entries == 10000
def test_gem2sam_sam2bam():
    """Chain mapper, gem2sam and sam2bam, then count the BAM file's entries.

    The BAM file must exist and yield 10000 entries when re-opened through
    ``gem.files.open``.
    """
    reads = gem.files.open(testfiles["reads_1.fastq"])
    mapped = gem.mapper(reads, index)
    converted = gem.gem2sam(mapped, index, compact=True)

    bam_path = results_dir + "/test_sam.bam"
    # keep a reference to the sam2bam result for the rest of the test
    bam_handle = gem.sam2bam(converted, output=bam_path)
    assert os.path.exists(bam_path)

    n_entries = sum(1 for _ in gem.files.open(bam_path))
    assert n_entries == 10000, "Count 10000!=%d" % n_entries
Exemplo n.º 18
0
def test_quality_pass_on_execution():
    """Map reads_1 into a file and check the result's quality is 'offset-33'."""
    reads = files.open(testfiles["reads_1.fastq"])
    map_path = results_dir + "/quality_passon_mapping.map"
    result = gem.mapper(reads, index, output=map_path)

    failure_hint = "Quality should be 'offset-33' but is %s"
    assert result.quality == "offset-33", failure_hint % (str(result.quality))
Exemplo n.º 19
0
    # output
    local_out_sam = out_sam_path + '.%d:%d-%d' % (
        N_WINDOWS - len(range_stop), seq_beg, seq_end)
    out_files.append(local_out_sam)
    # input
    inputf = gem.files.open(fastq_path)

    # trimming
    trimmed = gem.filter.run_filter(
        inputf, ['--hard-trim', '%d,%d' % (trim_5, trim_3)],
        threads=nthreads, paired=not single_end)

    # mapping
    mapped = gem.mapper(trimmed, gem_index_path, min_decoded_strata=0,
                        max_decoded_matches=2, unique_mapping=False,
                        max_edit_distance=max_edit_distance,
                        mismatches=mismatches,
                        output=temp_dir + '/test.map',
                        threads=nthreads)

    # convert to sam/bam
    if output_is_bam:
        sam = gem.gem2sam(mapped, index=gem_index_path, threads=nthreads,
                          single_end=single_end)
        _ = gem.sam2bam(sam, output=local_out_sam, threads=nthreads)
    else:
        sam = gem.gem2sam(mapped, index=gem_index_path, output=local_out_sam,
                          threads=nthreads, single_end=single_end)

    # Recursively go to the next iteration.
    unmapped_fastq_path = os.path.split(fastq_path)[1]
    if unmapped_fastq_path[-1].isdigit():
Exemplo n.º 20
0
    # output
    local_out_sam = out_sam_path + '.%d' % (seq_len)
    out_files.append(local_out_sam)
    # input
    inputf = gem.files.open(fastq_path)

    # trimming
    trimmed = gem.filter.run_filter(
        inputf, ['--hard-trim', '%d,%d' % (trim_5, trim_3)],
        threads=nthreads, paired=not single_end)
    
    # mapping
    mapped = gem.mapper(trimmed, gem_index_path, min_decoded_strata=0,
                        max_decoded_matches=2, unique_mapping=False,
                        max_edit_distance=max_edit_distance,
                        mismatches=mismatches,
                        output=temp_dir + '/test.map',
                        threads=nthreads)

    # convert to sam
    sam = gem.gem2sam(mapped, index=gem_index_path, output=local_out_sam,
                      threads=nthreads, single_end=single_end)
    if output_is_bam:
        sam = gem.gem2sam(mapped, index=gem_index_path, threads=nthreads,
                          single_end=single_end)
        _ = gem.sam2bam(sam, output=local_out_sam, threads=nthreads)
    else:
        sam = gem.gem2sam(mapped, index=gem_index_path, output=local_out_sam,
                          threads=nthreads, single_end=single_end)

    # Recursively go to the next iteration.
Exemplo n.º 21
0
def iterative_mapping(gem_index_path, fastq_path, out_sam_path,
                      range_start, range_stop, **kwargs):
    """
    Map iteratively a given FASTQ file to a reference genome.

    Each call maps the first window (``range_start[0]``, ``range_stop[0]``),
    hands the input and the resulting SAM/BAM file to
    ``_filter_unmapped_fastq`` to produce a temporary FASTQ file, and then
    calls itself on that file with the remaining windows. The caller's
    ``range_start`` / ``range_stop`` sequences are not modified.

    :param gem_index_path: path to index file created from a reference genome
       using gem-index tool
    :param fastq_path: PATH to fastq file, either compressed or not.
    :param out_sam_path: path to a directory where to store mapped reads in SAM/
       BAM format (see option output_is_bam).
    :param range_start: list of integers representing the start position of each
       read fragment to be mapped (starting at 1 includes the first nucleotide
       of the read).
    :param range_stop: list of integers representing the end position of each
       read fragment to be mapped.
    :param True single_end: when FASTQ contains paired-ends flags
    :param 4 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations allowed
       while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param -1 max_reads_per_chunk: maximum number of reads to process at a time.
       If -1, all reads will be processed in one run (more RAM memory needed).
    :param [] out_files: list to which the paths of generated files are
       appended; it is also the list returned.
    :param False output_is_bam: Use binary (compressed) form of generated
       out-files with mapped reads (recommended to save disk space).
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will be
       written there. Defaults to ``tempfile.gettempdir()``.

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.sam_parser.parse_sam`

    :raises Exception: if the windows are not both lists (or both tuples),
       cannot be converted to integers, differ in length, or contain a start
       that is not strictly positive or not lower than its stop.
    :raises IOError: if two lines cannot be read from the FASTQ file.
    """
    # Must precede every use of the name: a ``global`` statement placed after
    # a use is a SyntaxError on Python 3 (a SyntaxWarning on Python 2).
    # NOTE(review): N_WINDOWS is module state shared between calls; it is only
    # reset in the chunking branch below.
    global N_WINDOWS

    gem_index_path      = os.path.abspath(os.path.expanduser(gem_index_path))
    fastq_path          = os.path.abspath(os.path.expanduser(fastq_path))
    out_sam_path        = os.path.abspath(os.path.expanduser(out_sam_path))
    single_end          = kwargs.get('single_end'          , True)
    max_edit_distance   = kwargs.get('max_edit_distance'   , 0.04)
    mismatches          = kwargs.get('mismatches'          , 0.04)
    nthreads            = kwargs.get('nthreads'            , 4)
    max_reads_per_chunk = kwargs.get('max_reads_per_chunk' , -1)
    out_files           = kwargs.get('out_files'           , [])
    output_is_bam       = kwargs.get('output_is_bam'       , False)
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', tempfile.gettempdir())))

    # check kwargs
    known_kwargs = ('single_end', 'nthreads', 'max_edit_distance',
                    'mismatches', 'max_reads_per_chunk',
                    'out_files', 'output_is_bam', 'temp_dir')
    for kw in kwargs:
        if kw not in known_kwargs:
            warn('WARNING: %s not is usual keywords, misspelled?' % kw)

    # check windows:
    if not isinstance(range_start, list) or not isinstance(range_stop, list):
        if (not isinstance(range_start, tuple) or
            not isinstance(range_stop, tuple)):
            raise Exception('ERROR: range_start and range_stop should be lists')
    # Private copies: the windows are consumed with pop(0) below, which must
    # not empty the lists owned by the caller.
    range_start = list(range_start)
    range_stop  = list(range_stop)
    if (not all(isinstance(i, int) for i in range_start) or
        not all(isinstance(i, int) for i in range_stop)):
        try:
            range_start = [int(i) for i in range_start]
            range_stop  = [int(i) for i in range_stop]
        except ValueError:
            raise Exception('ERROR: range_start and range_stop should contain' +
                            ' integers only')
        warn('WARNING: range_start and range_stop converted to integers')
    # NOTE(review): the message mentions unique windows, but only the sizes
    # were ever compared (the former zip-length test was implied by this one).
    if len(range_start) != len(range_stop):
        raise Exception('ERROR: range_start and range_stop should have the ' +
                        'same sizes and windows should be uniques.')
    if any(i >= j for i, j in zip(range_start, range_stop)):
        raise Exception('ERROR: start positions should always be lower than ' +
                        'stop positions.')
    if any(i <= 0 for i in range_start):
        raise Exception('ERROR: start positions should be strictly positive.')

    # No window left: end of the recursion. Returning here avoids opening the
    # FASTQ file, which would raise IOError if it has fewer than two lines.
    if not range_start:
        return out_files

    # create directories
    for rep in [temp_dir, os.path.split(out_sam_path)[0]]:
        mkdir(rep)

    # get the length of a read
    if fastq_path.endswith('.gz'):
        fastqh = gzip.open(fastq_path)
    else:
        fastqh = open(fastq_path)
    # get the length from the length of the second line, which is the sequence
    # can not use the "length" keyword, as it is not always present
    try:
        next(fastqh)
        raw_seq_len = len(next(fastqh).strip())
    except StopIteration:
        raise IOError('ERROR: problem reading %s\n' % fastq_path)
    finally:
        # close the handle on the error path too
        fastqh.close()

    if not N_WINDOWS:
        N_WINDOWS = len(range_start)
    # Split input files if required and apply iterative mapping to each
    # segment separately.
    if max_reads_per_chunk > 0:
        # the recursive calls must not chunk again
        kwargs['max_reads_per_chunk'] = -1
        print('Split input file %s into chunks' % fastq_path)
        chunked_files = _chunk_file(
            fastq_path,
            os.path.join(temp_dir, os.path.split(fastq_path)[1]),
            max_reads_per_chunk * 4)
        print('%d chunks obtained' % len(chunked_files))
        for i, fastq_chunk_path in enumerate(chunked_files):
            # reset so that the recursive call recomputes the window count
            N_WINDOWS = 0
            print('Run iterative_mapping recursively on %s' % fastq_chunk_path)
            chunk_files = iterative_mapping(
                gem_index_path, fastq_chunk_path,
                out_sam_path + '.%d' % (i + 1), range_start[:], range_stop[:],
                **kwargs)
            # When 'out_files' was passed in kwargs the recursion appended to
            # this very list; extending it with itself would duplicate paths.
            if chunk_files is not out_files:
                out_files.extend(chunk_files)

        # Delete chunks only if the file was really chunked.
        if len(chunked_files) > 1:
            print('Remove the chunks: %s' % ' '.join(chunked_files))
            for fastq_chunk_path in chunked_files:
                os.remove(fastq_chunk_path)
        return out_files

    # window handled by this call (the lists are known to be non-empty here)
    seq_end = range_stop.pop(0)
    seq_beg = range_start.pop(0)

    # define what we trim
    seq_len = seq_end - seq_beg
    trim_5, trim_3 = trimming(raw_seq_len, seq_beg - 1, seq_len - 1)

    # output: suffix is "<window number>:<start>-<stop>"
    window_tag = '.%d:%d-%d' % (N_WINDOWS - len(range_stop), seq_beg, seq_end)
    local_out_sam = out_sam_path + window_tag
    out_files.append(local_out_sam)
    # input
    inputf = gem.files.open(fastq_path)

    # trimming
    trimmed = gem.filter.run_filter(
        inputf, ['--hard-trim', '%d,%d' % (trim_5, trim_3)],
        threads=nthreads, paired=not single_end)

    # mapping
    mapped = gem.mapper(trimmed, gem_index_path, min_decoded_strata=0,
                        max_decoded_matches=2, unique_mapping=False,
                        max_edit_distance=max_edit_distance,
                        mismatches=mismatches,
                        output=temp_dir + '/test.map',
                        threads=nthreads)

    # convert to sam/bam
    if output_is_bam:
        sam = gem.gem2sam(mapped, index=gem_index_path, threads=nthreads,
                          single_end=single_end)
        _ = gem.sam2bam(sam, output=local_out_sam, threads=nthreads)
    else:
        sam = gem.gem2sam(mapped, index=gem_index_path, output=local_out_sam,
                          threads=nthreads, single_end=single_end)

    # Recursively go to the next iteration.
    unmapped_fastq_path = os.path.split(fastq_path)[1]
    if unmapped_fastq_path[-1].isdigit():
        # drop the suffix after the last dot when the name ends with a digit
        unmapped_fastq_path = unmapped_fastq_path.rsplit('.', 1)[0]
    unmapped_fastq_path = os.path.join(
        temp_dir, unmapped_fastq_path + window_tag)
    _filter_unmapped_fastq(fastq_path, local_out_sam, unmapped_fastq_path)

    next_files = iterative_mapping(gem_index_path, unmapped_fastq_path,
                                   out_sam_path,
                                   range_start, range_stop, **kwargs)
    # same guard as in the chunking branch: never extend the list with itself
    if next_files is not out_files:
        out_files.extend(next_files)
    os.remove(unmapped_fastq_path)
    return out_files
Exemplo n.º 22
0
def iterative_mapping(gem_index_path, fastq_path, out_sam_path, range_start,
                      range_stop, **kwargs):
    """
    Map iteratively a given FASTQ file to a reference genome.

    Each call maps the first window (``range_start[0]``, ``range_stop[0]``),
    hands the input and the resulting SAM/BAM file to
    ``_filter_unmapped_fastq`` to produce a temporary FASTQ file, and then
    calls itself on that file with the remaining windows. The caller's
    ``range_start`` / ``range_stop`` sequences are not modified.

    :param gem_index_path: path to index file created from a reference genome
       using gem-index tool
    :param fastq_path: PATH to fastq file, either compressed or not.
    :param out_sam_path: path to a directory where to store mapped reads in SAM/
       BAM format (see option output_is_bam).
    :param range_start: list of integers representing the start position of each
       read fragment to be mapped (starting at 1 includes the first nucleotide
       of the read).
    :param range_stop: list of integers representing the end position of each
       read fragment to be mapped.
    :param True single_end: when FASTQ contains paired-ends flags
    :param 4 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations allowed
       while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param -1 max_reads_per_chunk: maximum number of reads to process at a time.
       If -1, all reads will be processed in one run (more RAM memory needed).
    :param [] out_files: list to which the paths of generated files are
       appended; it is also the list returned.
    :param False output_is_bam: Use binary (compressed) form of generated
       out-files with mapped reads (recommended to save disk space).
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will be
       written there. Defaults to ``tempfile.gettempdir()``.

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.sam_parser.parse_sam`

    :raises Exception: if the windows are not both lists (or both tuples),
       cannot be converted to integers, differ in length, or contain a start
       that is not strictly positive or not lower than its stop.
    :raises IOError: if two lines cannot be read from the FASTQ file.
    """
    # Must precede every use of the name: a ``global`` statement placed after
    # a use is a SyntaxError on Python 3 (a SyntaxWarning on Python 2).
    # NOTE(review): N_WINDOWS is module state shared between calls; it is only
    # reset in the chunking branch below.
    global N_WINDOWS

    gem_index_path = os.path.abspath(os.path.expanduser(gem_index_path))
    fastq_path = os.path.abspath(os.path.expanduser(fastq_path))
    out_sam_path = os.path.abspath(os.path.expanduser(out_sam_path))
    single_end = kwargs.get('single_end', True)
    max_edit_distance = kwargs.get('max_edit_distance', 0.04)
    mismatches = kwargs.get('mismatches', 0.04)
    nthreads = kwargs.get('nthreads', 4)
    max_reads_per_chunk = kwargs.get('max_reads_per_chunk', -1)
    out_files = kwargs.get('out_files', [])
    output_is_bam = kwargs.get('output_is_bam', False)
    temp_dir = os.path.abspath(
        os.path.expanduser(kwargs.get('temp_dir', tempfile.gettempdir())))

    # check kwargs
    known_kwargs = (
        'single_end', 'nthreads', 'max_edit_distance', 'mismatches',
        'max_reads_per_chunk', 'out_files', 'output_is_bam', 'temp_dir'
    )
    for kw in kwargs:
        if kw not in known_kwargs:
            warn('WARNING: %s not is usual keywords, misspelled?' % kw)

    # check windows:
    if not isinstance(range_start, list) or not isinstance(range_stop, list):
        if (not isinstance(range_start, tuple)
                or not isinstance(range_stop, tuple)):
            raise Exception(
                'ERROR: range_start and range_stop should be lists')
    # Private copies: the windows are consumed with pop(0) below, which must
    # not empty the lists owned by the caller.
    range_start = list(range_start)
    range_stop = list(range_stop)
    if (not all(isinstance(i, int) for i in range_start)
            or not all(isinstance(i, int) for i in range_stop)):
        try:
            range_start = [int(i) for i in range_start]
            range_stop = [int(i) for i in range_stop]
        except ValueError:
            raise Exception(
                'ERROR: range_start and range_stop should contain' +
                ' integers only')
        warn('WARNING: range_start and range_stop converted to integers')
    # NOTE(review): the message mentions unique windows, but only the sizes
    # were ever compared (the former zip-length test was implied by this one).
    if len(range_start) != len(range_stop):
        raise Exception('ERROR: range_start and range_stop should have the ' +
                        'same sizes and windows should be uniques.')
    if any(i >= j for i, j in zip(range_start, range_stop)):
        raise Exception('ERROR: start positions should always be lower than ' +
                        'stop positions.')
    if any(i <= 0 for i in range_start):
        raise Exception('ERROR: start positions should be strictly positive.')

    # No window left: end of the recursion. Returning here avoids opening the
    # FASTQ file, which would raise IOError if it has fewer than two lines.
    if not range_start:
        return out_files

    # create directories
    for rep in [temp_dir, os.path.split(out_sam_path)[0]]:
        mkdir(rep)

    # get the length of a read
    if fastq_path.endswith('.gz'):
        fastqh = gzip.open(fastq_path)
    else:
        fastqh = open(fastq_path)
    # get the length from the length of the second line, which is the sequence
    # can not use the "length" keyword, as it is not always present
    try:
        next(fastqh)
        raw_seq_len = len(next(fastqh).strip())
    except StopIteration:
        raise IOError('ERROR: problem reading %s\n' % fastq_path)
    finally:
        # close the handle on the error path too
        fastqh.close()

    if not N_WINDOWS:
        N_WINDOWS = len(range_start)
    # Split input files if required and apply iterative mapping to each
    # segment separately.
    if max_reads_per_chunk > 0:
        # the recursive calls must not chunk again
        kwargs['max_reads_per_chunk'] = -1
        print('Split input file %s into chunks' % fastq_path)
        chunked_files = _chunk_file(
            fastq_path, os.path.join(temp_dir,
                                     os.path.split(fastq_path)[1]),
            max_reads_per_chunk * 4)
        print('%d chunks obtained' % len(chunked_files))
        for i, fastq_chunk_path in enumerate(chunked_files):
            # reset so that the recursive call recomputes the window count
            N_WINDOWS = 0
            print('Run iterative_mapping recursively on %s' % fastq_chunk_path)
            chunk_files = iterative_mapping(gem_index_path, fastq_chunk_path,
                                            out_sam_path + '.%d' % (i + 1),
                                            range_start[:], range_stop[:],
                                            **kwargs)
            # When 'out_files' was passed in kwargs the recursion appended to
            # this very list; extending it with itself would duplicate paths.
            if chunk_files is not out_files:
                out_files.extend(chunk_files)

        # Delete chunks only if the file was really chunked.
        if len(chunked_files) > 1:
            print('Remove the chunks: %s' % ' '.join(chunked_files))
            for fastq_chunk_path in chunked_files:
                os.remove(fastq_chunk_path)
        return out_files

    # window handled by this call (the lists are known to be non-empty here)
    seq_end = range_stop.pop(0)
    seq_beg = range_start.pop(0)

    # define what we trim
    seq_len = seq_end - seq_beg
    trim_5, trim_3 = trimming(raw_seq_len, seq_beg - 1, seq_len - 1)

    # output: suffix is "<window number>:<start>-<stop>"
    window_tag = '.%d:%d-%d' % (N_WINDOWS - len(range_stop), seq_beg, seq_end)
    local_out_sam = out_sam_path + window_tag
    out_files.append(local_out_sam)
    # input
    inputf = gem.files.open(fastq_path)

    # trimming
    trimmed = gem.filter.run_filter(
        inputf, ['--hard-trim', '%d,%d' % (trim_5, trim_3)],
        threads=nthreads,
        paired=not single_end)

    # mapping
    mapped = gem.mapper(trimmed,
                        gem_index_path,
                        min_decoded_strata=0,
                        max_decoded_matches=2,
                        unique_mapping=False,
                        max_edit_distance=max_edit_distance,
                        mismatches=mismatches,
                        output=temp_dir + '/test.map',
                        threads=nthreads)

    # convert to sam/bam
    if output_is_bam:
        sam = gem.gem2sam(mapped,
                          index=gem_index_path,
                          threads=nthreads,
                          single_end=single_end)
        _ = gem.sam2bam(sam, output=local_out_sam, threads=nthreads)
    else:
        sam = gem.gem2sam(mapped,
                          index=gem_index_path,
                          output=local_out_sam,
                          threads=nthreads,
                          single_end=single_end)

    # Recursively go to the next iteration.
    unmapped_fastq_path = os.path.split(fastq_path)[1]
    if unmapped_fastq_path[-1].isdigit():
        # drop the suffix after the last dot when the name ends with a digit
        unmapped_fastq_path = unmapped_fastq_path.rsplit('.', 1)[0]
    unmapped_fastq_path = os.path.join(temp_dir,
                                       unmapped_fastq_path + window_tag)
    _filter_unmapped_fastq(fastq_path, local_out_sam, unmapped_fastq_path)

    next_files = iterative_mapping(gem_index_path, unmapped_fastq_path,
                                   out_sam_path, range_start, range_stop,
                                   **kwargs)
    # same guard as in the chunking branch: never extend the list with itself
    if next_files is not out_files:
        out_files.extend(next_files)
    os.remove(unmapped_fastq_path)
    return out_files
Exemplo n.º 23
0
    paired_out = "%s_paired.map" % name
    # scored mappings
    final_out = "%s.map" % name
    # sam/bam output
    sam_out = "%s.bam" % name

    ## Create initial mapping
    # we deal with a single file with interleaved paired reads here, but
    # creating input from two files for the read pairs is straightforward using the interleave filter
    #
    # input_1 = gem.files.open(input_file)
    # input_2 = gem.files.open(input_file2)
    # input = gem.filter.interleave([input_1, input_2])
    print "Running initial mapping"
    input = gem.files.open(reads)
    initial_mapping = gem.mapper(input, index, initial_out, mismatches=0.07, delta=1, threads=THREADS)

    ## junction sites
    # before we can do the split mapping, we have to load
    # the junction sites from a gtf annotation and
    # run the denovo-junction detection. This will also give
    # us a mapping that preserves short indels detected during the
    # extraction run.
    print "Loading GTF junctions from %s" % annotation
    junctions = gem.junctions.from_gtf(annotation)

    # now the denovo run. This returns a tuple : (mapping, junctions)
    # and here we use the merge_with parameter to merge the denovo junctions with
    # the previously loaded gtf junctions.
    #
    # Also note that we pass only unmapped reads from the initial mapping to the junction