def run(opts):
    """Parse mapped Hi-C read files into TSV files and register the job.

    Resolves the input mapped files (from the command line or from the
    project database), parses them with ``parse_map`` (MAP format) or
    ``parse_sam`` (SAM/BAM), writes the result to
    ``<workdir>/02_parsed_reads``, appends per-read statistics to the
    machine-readable ``trace.log`` (guarded by a ``__lock_log`` file) and
    finally records everything via ``save_to_db``.

    :param opts: parsed command-line options; fields used here are read,
       mapped1, mapped2, jobids, renz, workdir, genome, filter_chrom,
       skip and compress_input
    """
    check_options(opts)
    launch_time = time.localtime()

    # which read-ends to process: 1, 2 or both
    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]

    if not opts.mapped1 and not opts.mapped2:
        # input files and enzyme name come from a previous mapping job
        # stored in the project database
        f_names1, f_names2, renz = load_parameters_fromdb(
            opts, reads, opts.jobids)
    else:
        # NOTE(review): if only one of mapped1/mapped2 is given, the other
        # f_names variable stays unbound and parse_sam below raises
        # NameError -- confirm whether both are required upstream.
        if opts.mapped1:
            f_names1 = opts.mapped1
        if opts.mapped2:
            f_names2 = opts.mapped2
        renz = opts.renz
    # '-' separates enzyme names, presumably for multi-enzyme digestions
    renz = renz.split('-')

    opts.workdir = path.abspath(opts.workdir)
    name = path.split(opts.workdir)[-1]
    param_hash = digest_parameters(opts)

    outdir = '02_parsed_reads'
    mkdir(path.join(opts.workdir, outdir))

    if not opts.read:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))
    elif opts.read == 1:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = None
        f_names2 = None
    elif opts.read == 2:
        # only read 2 requested: its files are processed as the single
        # input, written to the r2-named output file
        out_file2 = None
        f_names1 = f_names2
        f_names2 = None
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))

    logging.info('parsing genomic sequence')
    try:
        # allows the use of pickle genome to make it faster
        genome = load(open(opts.genome[0], 'rb'))
    except (UnpicklingError, KeyError):
        genome = parse_fasta(opts.genome, chr_regexp=opts.filter_chrom)

    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        if opts.mapped1 or opts.mapped2:
            counts, multis = parse_sam(f_names1, f_names2,
                                       out_file1=out_file1,
                                       out_file2=out_file2,
                                       re_name=renz, verbose=True,
                                       genome_seq=genome,
                                       compress=opts.compress_input)
        else:
            counts, multis = parse_map(f_names1, f_names2,
                                       out_file1=out_file1,
                                       out_file2=out_file2,
                                       re_name=renz, verbose=True,
                                       genome_seq=genome,
                                       compress=opts.compress_input)
    else:
        # parsing already done: recover counts/multis from the TSV headers
        counts = {}
        counts[0] = {}
        multis = {}
        multis[0] = {}
        with open(out_file1) as fhandler:
            for line in fhandler:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[0][item] = int(value)
                elif not line.startswith('#'):
                    break
            # histogram of multiple-contact reads ('|||'-separated)
            for line in fhandler:
                if '|||' in line:
                    try:
                        multis[0][line.count('|||')] += 1
                    except KeyError:
                        multis[0][line.count('|||')] = 1
        if out_file2:
            counts[1] = {}
            multis[1] = 0
            with open(out_file2) as fhandler:
                for line in fhandler:
                    if line.startswith('# MAPPED '):
                        _, _, item, value = line.split()
                        counts[1][item] = int(value)
                    elif not line.startswith('#'):
                        break
                for line in fhandler:
                    if '|||' in line:
                        multis[1] += line.count('|||')

    # write machine log, waiting for then taking the lock file
    # NOTE(review): exists()+open() is not atomic -- two concurrent jobs
    # can still race here.
    while path.exists(path.join(opts.workdir, '__lock_log')):
        time.sleep(0.5)
    open(path.join(opts.workdir, '__lock_log'), 'a').close()
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        for read in counts:
            for item in counts[read]:
                # counts[0] is built from out_file1 and counts[1] from
                # out_file2 (see the skip branch above); the original
                # 'read == 1' test logged the wrong file for each read.
                mlog.write('# PARSED READ%s PATH\t%d\t%s\n'
                           % (read, counts[read][item],
                              out_file1 if read == 0 else out_file2))
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_log'))
    except OSError:
        pass

    finish_time = time.localtime()
    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2,
               out_file1, out_file2, launch_time, finish_time)
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq, genome_fasta, genome_index, \ output_directory, output_prefix, enzyme, res, chromosomes, threads_number, \ clean_tmp, tmp_dir): print 'Begin to process reads.' left_reads = '' right_reads = '' if reads_fastq != '': # left and right reads are stored in one file range_start_left, range_stop_left, \ range_start_right, range_stop_right = calc_left_right_ranges(reads_fastq) print 'Reads: ', reads_fastq left_reads = reads_fastq right_reads = reads_fastq else: # left and right reads are stored separately range_start_left, range_stop_left, \ range_start_right, range_stop_right = calc_range(left_reads_fastq) print 'Left reads: ', left_reads_fastq print 'Right reads: ', right_reads_fastq print 'Output prefix: ', output_prefix left_reads = left_reads_fastq right_reads = right_reads_fastq print 'Reference genome FASTA: ', genome_fasta print 'Reference genome GEM index:', genome_index print 'Output directory: ', output_directory print 'Temp directory: ', tmp_dir print 'Enzyme: ', enzyme print 'Resolution: ', res, 'bp' print 'Number of threads: ', threads_number print 'Start pos for left reads: ', range_start_left print 'Stop pos for left reads: ', range_stop_left print 'Start pos for right reads: ', range_start_right print 'Stop pos for right reads: ', range_stop_right stdout.flush() # map left reads to reference genome out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam' out_sam_left_path = join(output_directory, out_sam_left_name) print 'Iterative mapping of left reads (using ' + str(threads_number) + ' threads)...' stdout.flush() sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path, \ range_start_left, range_stop_left, nthreads=threads_number, temp_dir=tmp_dir) print 'Done.' 
stdout.flush() # map right reads to reference genome out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam' out_sam_right_path = join(output_directory, out_sam_right_name) print 'Iterative mapping of right reads (using ' + str(threads_number) + ' threads)...' stdout.flush() sams_right = iterative_mapping(genome_index, right_reads, out_sam_right_path, \ range_start_right, range_stop_right, nthreads=threads_number, temp_dir=tmp_dir) print 'Done.' stdout.flush() # load reference genome sequence print 'Load reference genome sequence...' stdout.flush() chroms = chromosomes[:] genome_seq = parse_fasta(genome_fasta, chr_names=chroms) print 'Done.' stdout.flush() # create files with information about every left and right read # and about their placement with respect to restriction sites tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv' tsv_left = join(output_directory, tsv_left_name) tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv' tsv_right = join(output_directory, tsv_right_name) print 'Get information about restriction sites and reads placement...' stdout.flush() parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme, \ verbose=True, ncpus=8) print 'Done.' stdout.flush() # create file with both left and right reads that uniquelly mapped to reference genome if reads_fastq != '': # left and right reads are stored in one file common_reads_prefix = splitext(basename(reads_fastq))[0] else: # left and right reads are stored separately common_reads_prefix = output_prefix uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv' uniq_reads = join(output_directory, uniq_reads_name) print 'Merge info about left and right reads in one file...' stdout.flush() get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True) print 'Done.' stdout.flush() # find read IDs that are filtered by default TADbit filters print 'Mask reads...' 
stdout.flush() # debug print "uniq_reads =", uniq_reads masked = filter_reads(uniq_reads) print 'Done.' stdout.flush() # apply all filters (exclude reads that were filtered) print 'Filter masked reads...' stdout.flush() filtered_reads_name = common_reads_prefix + '_filtered.tsv' filtered_reads = join(output_directory, filtered_reads_name) apply_filter(uniq_reads, filtered_reads, masked) print 'Done.' stdout.flush() # create matrices (one matrix per chromosome) print 'Create Hi-C maps (one per chromosome)...' stdout.flush() hic_map(filtered_reads, resolution=res, by_chrom='intra', savedata=output_directory) print 'Done.' stdout.flush() print 'Add resolution (' + str(resolution) + ') to matrix filenames...' stdout.flush() add_resolution(chromosomes, resolution, output_directory) print 'Done.' stdout.flush() print 'Add headers to matrix files...' stdout.flush() add_headers(chromosomes, resolution, output_directory) print 'Done.' stdout.flush() if clean_tmp: # Remove all SAM and TSV files from the output directory print 'Remove SAM and TSV files from the output directory.' stdout.flush() map(os.remove, glob.glob(out_sam_left_path + '*')) map(os.remove, glob.glob(out_sam_right_path + '*')) map(os.remove, glob.glob(join(output_directory, '*.tsv'))) print 'Done.' stdout.flush()
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq, genome_fasta, genome_index, \ output_directory, output_prefix, enzyme, res, chromosomes, threads_number, \ clean_tmp, tmp_dir): print 'Begin to process reads.' left_reads = '' right_reads = '' if reads_fastq != '': # left and right reads are stored in one file range_start_left, range_stop_left, \ range_start_right, range_stop_right = calc_left_right_ranges(reads_fastq) print 'Reads: ', reads_fastq left_reads = reads_fastq right_reads = reads_fastq else: # left and right reads are stored separately range_start_left, range_stop_left, \ range_start_right, range_stop_right = calc_range(left_reads_fastq) print 'Left reads: ', left_reads_fastq print 'Right reads: ', right_reads_fastq print 'Output prefix: ', output_prefix left_reads = left_reads_fastq right_reads = right_reads_fastq print 'Reference genome FASTA: ', genome_fasta print 'Reference genome GEM index:', genome_index print 'Output directory: ', output_directory print 'Temp directory: ', tmp_dir print 'Enzyme: ', enzyme print 'Resolution: ', res, 'bp' print 'Number of threads: ', threads_number print 'Start pos for left reads: ', range_start_left print 'Stop pos for left reads: ', range_stop_left print 'Start pos for right reads: ', range_start_right print 'Stop pos for right reads: ', range_stop_right stdout.flush() # map left reads to reference genome out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam' out_sam_left_path = join(output_directory, out_sam_left_name) print 'Iterative mapping of left reads (using ' + str( threads_number) + ' threads)...' stdout.flush() sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path, \ range_start_left, range_stop_left, nthreads=threads_number, temp_dir=tmp_dir) print 'Done.' 
stdout.flush() # map right reads to reference genome out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam' out_sam_right_path = join(output_directory, out_sam_right_name) print 'Iterative mapping of right reads (using ' + str( threads_number) + ' threads)...' stdout.flush() sams_right = iterative_mapping(genome_index, right_reads, out_sam_right_path, \ range_start_right, range_stop_right, nthreads=threads_number, temp_dir=tmp_dir) print 'Done.' stdout.flush() # load reference genome sequence print 'Load reference genome sequence...' stdout.flush() chroms = chromosomes[:] genome_seq = parse_fasta(genome_fasta, chr_names=chroms) print 'Done.' stdout.flush() # create files with information about every left and right read # and about their placement with respect to restriction sites tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv' tsv_left = join(output_directory, tsv_left_name) tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv' tsv_right = join(output_directory, tsv_right_name) print 'Get information about restriction sites and reads placement...' stdout.flush() parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme, \ verbose=True, ncpus=8) print 'Done.' stdout.flush() # create file with both left and right reads that uniquelly mapped to reference genome if reads_fastq != '': # left and right reads are stored in one file common_reads_prefix = splitext(basename(reads_fastq))[0] else: # left and right reads are stored separately common_reads_prefix = output_prefix uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv' uniq_reads = join(output_directory, uniq_reads_name) print 'Merge info about left and right reads in one file...' stdout.flush() get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True) print 'Done.' stdout.flush() # find read IDs that are filtered by default TADbit filters print 'Mask reads...' stdout.flush() masked = filter_reads(uniq_reads) print 'Done.' 
stdout.flush() # apply all filters (exclude reads that were filtered) print 'Filter masked reads...' stdout.flush() filtered_reads_name = common_reads_prefix + '_filtered.tsv' filtered_reads = join(output_directory, filtered_reads_name) apply_filter(uniq_reads, filtered_reads, masked) print 'Done.' stdout.flush() # create matrices (one matrix per chromosome) print 'Create Hi-C maps (one per chromosome)...' stdout.flush() hic_map(filtered_reads, resolution=res, by_chrom='intra', savedata=output_directory) print 'Done.' stdout.flush() print 'Add resolution (' + str(resolution) + ') to matrix filenames...' stdout.flush() add_resolution(chromosomes, resolution, output_directory) print 'Done.' stdout.flush() print 'Add headers to matrix files...' stdout.flush() add_headers(chromosomes, resolution, output_directory) print 'Done.' stdout.flush() if clean_tmp: # Remove all SAM and TSV files from the output directory print 'Remove SAM and TSV files from the output directory.' stdout.flush() remove(out_sam_left_path + '*') remove(out_sam_right_path + '*') remove(join(output_directory, '*.tsv')) print 'Done.' stdout.flush()
out_sam_path = OUTPATH + '%s_r2.txt' % rep, temp_dir = PATH + 'tmp_dir/', range_start = range(80, 55, -5), # starts with a flag sequence range_stop = [100] * 5, nthreads = 8, # on intel corei7 CPUs 4 threads are as fast as # 8, but leave some room for you other applications max_reads_per_chunk = chunk, single_end = True) sams1 = [OUTPATH + fnam for fnam in os.listdir(OUTPATH) if fnam.rsplit('.', 2)[0].endswith('_r1.txt')] sams2 = [OUTPATH + fnam for fnam in os.listdir(OUTPATH) if fnam.rsplit('.', 2)[0].endswith('_r2.txt')] print 'created thes SAM files:', sams2 parse_sam(sams1, sams2, frags, OUTPATH + 'reads1_%s.tsv' % rep, OUTPATH + 'reads2_%s.tsv' % rep, genome_seq, 'HindIII', verbose=True) reads1 = OUTPATH + 'reads1_%s.tsv' % rep reads2 = OUTPATH + 'reads2_%s.tsv' % rep reads = OUTPATH + 'reads12_%s.tsv' % rep get_intersection(reads1, reads2, reads, verbose=True) from pytadbit.mapping.analyze import hic_map hic_map(reads, genome_seq, resolution=100000, savedata='lala') from pytadbit.mapping.analyze import plot_genomic_distribution plot_genomic_distribution(reads, resolution=50000, genome_seq=genome_seq) # because I know it
pos1 -= 3 break read1 = {'crm': crm, 'pos': pos1, 'flag': sd1, 'id': 'lala04.%012d' % (i)} read2 = {'crm': crm, 'pos': pos2, 'flag': sd2, 'id': 'lala04.%012d' % (i)} out1.write(read_str.format(**read1)) out2.write(read_str.format(**read2)) # TOO CLOSE FROM RE out1.close() out2.close() # PARSE SAM from pytadbit.parsers.sam_parser import parse_sam parse_sam(['test_read1.sam~'], ['test_read2.sam~'], 'lala1~', 'lala2~', genome, re_name='DPNII', mapper='GEM') # GET INTERSECTION from pytadbit.mapping.mapper import get_intersection get_intersection('lala1~', 'lala2~', 'lala~') # FILTER from pytadbit.mapping.filter import filter_reads masked = filter_reads('lala~')