예제 #1
0
def map_reads(cell_line, path_input, path_output, genome_version):
    """Iteratively map both mates of a paired-end sample with bowtie2.

    For each mate (``R1`` and ``R2``) the gzipped FASTQ
    ``<path_input><cell_line>_<mate>.fastq.gz`` is handed to
    ``mapping.iterative_mapping`` and the result is written to
    ``<path_output>bam/<cell_line>/<cell_line>_<mate>.bam``.

    All paths are built by plain string concatenation, so ``path_input``
    and ``path_output`` must already end with a path separator.

    Args:
        cell_line: Sample name used in the FASTQ, index, temp and BAM paths.
        path_input: Directory prefix that holds the FASTQ files.
        path_output: Directory prefix that holds ``index_<cell_line>/`` and
            receives ``bam/<cell_line>/``; ``tmp_<cell_line>`` under it is
            passed to the mapper as its temp directory.
        genome_version: Name of the bowtie2 index inside
            ``index_<cell_line>/``.

    Raises:
        OSError: If the BAM output directory cannot be created.
    """
    bam_dir = path_output + 'bam/' + cell_line
    # makedirs rather than mkdir: the intermediate 'bam/' directory may not
    # exist yet when the first sample is processed.
    if not os.path.exists(bam_dir):
        os.makedirs(bam_dir)

    index_path = path_output + 'index_' + cell_line + '/' + genome_version
    temp_dir = path_output + 'tmp_' + cell_line

    # Both mates are mapped with identical settings; only the file suffix
    # differs.
    for mate in ('R1', 'R2'):
        mapping.iterative_mapping(
            bowtie_path='/usr/bin/bowtie2',
            bowtie_index_path=index_path,
            fastq_path=path_input + cell_line + '_' + mate + '.fastq.gz',
            out_sam_path=bam_dir + '/' + cell_line + '_' + mate + '.bam',
            min_seq_len=25,
            seq_start=4,
            len_step=3,
            nthreads=8,
            temp_dir=temp_dir,
            bowtie_flags='--very-sensitive')
예제 #2
0
def step1(hiclib_path,
          dataset='Kalhor2012NB',
          sraid='SRR071231',
          readlen=40):
    """Map both sides of one SRA run with hiclib, then parse the alignments.

    Adopted from the hiclib tutorial:
    http://mirnylab.bitbucket.org/hiclib/tutorial/01_iterative_mapping.html

    Args:
        hiclib_path: Path of the hiclib folder on this machine; bowtie2, its
            hg19 index, fastq-dump and the hg19 FASTA are looked up under it.
        dataset: Sub-folder of ``../data/SRA/`` that holds the run.
        sraid: SRA run accession; it names the ``.sra`` input, both BAM
            outputs and the ``<sraid>_mapped_reads.hdf5`` file created in
            the current directory.
        readlen: Length of each read side (40 by default).
    """
    # Function-local imports, as in the tutorial this was adopted from.
    import os
    import logging
    from hiclib import mapping
    from mirnylib import h5dict, genome

    logging.basicConfig(level=logging.DEBUG)

    sra_root = '../data/SRA/'
    bowtie_root = hiclib_path + '/bin/bowtie2'
    sra_file = sra_root + dataset + '/' + sraid + '/' + sraid + '.sra'

    # A. Map the reads iteratively.
    # Each entry is (output BAM, seq_start, seq_end): the first side is
    # mapped from the window 0..readlen, the second from readlen..2*readlen.
    sides = (
        (sra_root + sraid + '_1.bam', 0, readlen),
        (sra_root + sraid + '_2.bam', readlen, 2 * readlen),
    )
    for bam_path, window_start, window_end in sides:
        mapping.iterative_mapping(
            bowtie_path=bowtie_root + '/bowtie2',
            bowtie_index_path=bowtie_root + '/index/hg19',
            fastq_path=sra_file,
            out_sam_path=bam_path,
            min_seq_len=25,
            len_step=5,
            seq_start=window_start,
            seq_end=window_end,
            nthreads=12,  # leave some cores free for other applications
            # max_reads_per_chunk=10000000 is optional, for low-memory machines
            temp_dir=sra_root,  # optional, temporary files are kept here
            bowtie_flags='--very-sensitive',
            bash_reader=hiclib_path + '/bin/sra/bin/fastq-dump -Z')

    # B. Parse the mapped sequences into a Python data structure,
    #    assign the ultra-sonic fragments to restriction fragments.
    mapped_reads = h5dict.h5dict(sraid + '_mapped_reads.hdf5')  # local folder
    genome_db = genome.Genome(hiclib_path + '/fasta/hg19',
                              readChrms=['#', 'X'])

    mapping.parse_sam(
        sam_basename1=sides[0][0],
        sam_basename2=sides[1][0],
        out_dict=mapped_reads,
        genome_db=genome_db,
        enzyme_name='HindIII')
예제 #3
0
    def doOne(inData, saveSams=True):
        """Map one pair of gzipped FASTQ files and parse the result.

        Both inputs are iteratively mapped into ``samFolder``, the input
        files are then deleted, the two alignments are parsed into an
        h5dict stored at ``outfile``, and finally the files written to
        ``samFolder`` for this pair are removed unless ``saveSams`` is true.
        Mapper settings (bowtie paths, threads, temp dir, ...) come from
        the enclosing scope.

        Args:
            inData: ``(file1, file2, outfile)`` -- the two gzipped FASTQ
                paths and the path of the h5dict to create.
            saveSams: Keep the files written to ``samFolder`` when true.

        Raises:
            ValueError: If the sequence line of the first record of an
                input is shorter than 10 characters.
        """
        file1, file2, outfile = inData
        print("Mapping {0} and {1} into {2}".format(*inData))

        fastqFiles = (file1, file2)
        # One SAM path per input, computed once and reused for parsing.
        samFiles = [
            os.path.join(samFolder, os.path.split(onefile)[1] + ".sam")
            for onefile in fastqFiles
        ]

        for onefile, samFile in zip(fastqFiles, samFiles):
            # Peek at the first record to learn the read length.  The context
            # manager closes the handle, which was previously left open.
            with gzip.open(onefile, 'r') as fastq:
                fastq.readline()  # skip the record header line
                length = len(fastq.readline()) - 1  # drop the newline
            if length < 10:
                raise ValueError(
                    "Length of your sequence is {0}. Something is wrong".
                    format(length))
            minlen, step = calculateStep(length - seqSkipStart, minMapLen)

            mapping.iterative_mapping(
                bowtie_path=bowtiePath,
                bowtie_index_path=bowtieIndex,
                fastq_path=onefile,
                out_sam_path=samFile,
                seq_start=seqSkipStart,
                # for bacteria the minimal mappable length is 15 bp, so
                # start with something slightly longer
                min_seq_len=minlen,
                len_step=step,  # and go with a usual step
                # leave some cores free for other applications
                nthreads=threads,
                # max_reads_per_chunk=10000000 is optional, for low-memory
                # machines
                temp_dir=tmpDir,
                bowtie_flags=bowtieFlags,
            )

        # The inputs are no longer needed once both sides are mapped.
        os.remove(file1)
        os.remove(file2)

        # Second step. Parse the mapped sequences into a Python data structure,
        #    assign the ultra-sonic fragments to restriction fragments.
        mapped_reads = h5dict.h5dict(outfile)
        sf1, sf2 = samFiles
        mapping.parse_sam(sam_basename1=sf1,
                          sam_basename2=sf2,
                          out_dict=mapped_reads,
                          genome_db=genome_db,
                          save_seqs=False,
                          maxReads=int(chunkSize * 1.6),
                          IDLen=50)

        if not saveSams:
            # Remove every file in samFolder whose name contains either
            # input's base name.
            baseNames = (os.path.split(file1)[1], os.path.split(file2)[1])
            for i in os.listdir(samFolder):
                if any(baseName in i for baseName in baseNames):
                    print("deleting", i)
                    os.remove(os.path.join(samFolder, i))
예제 #4
0
def map_reads(first_fq, second_fq, outfile, nice):
    """Map a pair of FASTQ files to SAM and parse them into an h5dict.

    Each FASTQ is mapped iteratively into ``args.samdir``; the SAM file
    name is the FASTQ name with everything from ``.fastq.gz`` onwards
    replaced by ``.sam``.  The two alignments are then parsed into an
    h5dict stored at ``args.samdir/outfile``.  Mapper settings are read
    from module-level names.

    Args:
        first_fq: FASTQ file for the first side of the pairs.
        second_fq: FASTQ file for the second side of the pairs.
        outfile: File name of the h5dict, created inside ``args.samdir``.
        nice: Increment passed to ``os.nice`` for this sub-process.
    """
    # Adjust the niceness of this sub-process.
    os.nice(nice)

    fastq_files = (first_fq, second_fq)
    sam_names = [fq.split(".fastq.gz")[0] + ".sam" for fq in fastq_files]

    # Map each fastq file -> sam file, first side then second side.
    for fastq, sam_name in zip(fastq_files, sam_names):
        read_length = check_len(fastq)
        min_len, step_size = calculate_step(read_length - seq_skip_start,
                                            min_map_len)
        mapping.iterative_mapping(
            bowtie_path=bowtie_path,
            bowtie_index_path=bowtie_index,
            fastq_path=fastq,
            out_sam_path=os.path.join(args.samdir, sam_name),
            min_seq_len=min_len,
            len_step=step_size,
            seq_start=seq_skip_start,
            nthreads=threads,
            bowtie_flags=bowtie_flags)

    # Parse the mapped sequences into the hdf5 dict structure and assign
    # the ultra-sonic fragments to restriction fragments (wording taken
    # from the hiclib tutorial).
    mapped_reads = h5dict.h5dict(os.path.join(args.samdir, outfile))
    sf1, sf2 = [os.path.join(args.samdir, name) for name in sam_names]
    mapping.parse_sam(
        sam_basename1=sf1,
        sam_basename2=sf2,
        out_dict=mapped_reads,
        genome_db=genome_db,
        save_seqs=False,
        maxReads=10000000,
        IDLen=50,
        enzyme_name='HindIII')
예제 #5
0
def func():
    """Iteratively map the FASTQ named by ``args.file`` against mm9.

    The alignment is written to the hard-coded scratch directory as
    ``<file name of args.file>.bam``; the system temp directory is used
    for intermediate files.
    """
    # Map the reads iteratively.
    from hiclib import mapping

    fastq = args.file
    bam_name = path.split(fastq)[1] + '.bam'
    settings = dict(
        bowtie_path='bowtie2',
        bowtie_index_path='../genomes/mm9/index/mm9',
        fastq_path=fastq,
        out_sam_path=path.join('/exports/eddie/scratch/s1529682/bams/',
                               bam_name),
        min_seq_len=25,
        len_step=5,
        nthreads=4,
        # max_reads_per_chunk=10000000 is optional, for low-memory machines
        temp_dir=tempfile.gettempdir(),  # optional, temporary files go here
        bowtie_flags='--very-sensitive',
    )
    mapping.iterative_mapping(**settings)
예제 #6
0
def mapFile(fastq, read):
    """Iteratively map one read side of ``fastq`` and return the BAM path.

    Inputs with a ``.sra`` extension are read through ``options.sra -Z``
    and restricted to the window
    ``options.readLength * (read - 1) .. options.readLength * read``;
    any other input is mapped without a window.  All settings are taken
    from the module-level ``options`` object.

    Args:
        fastq: Path of the input file (``.sra`` or anything else).
        read: Read number; it is part of the output name and, for ``.sra``
            inputs, selects the window to map.

    Returns:
        ``options.outputDir`` + the input's base name without extension +
        ``_R<read>.bam``.
    """
    fileName, fileExtension = os.path.splitext(fastq)
    bamOutput = (options.outputDir + fileName.split(os.sep)[-1] + '_R' +
                 str(read) + '.bam')

    # Settings shared by both input kinds.
    mapArgs = dict(
        bowtie_path=options.bowtie,
        bowtie_index_path=options.index,
        fastq_path=fastq,
        out_sam_path=bamOutput,
        min_seq_len=25,
        len_step=5,
        nthreads=options.cpus,
        temp_dir=options.tmpDir,
        bowtie_flags='--very-sensitive',
    )

    if fileExtension == '.sra':
        if options.verbose:
            # sys.stdout.write behaves the same on Python 2 and 3, unlike
            # the `print >> stream` statement.
            sys.stdout.write("Map short read archive %s utilizing %s\n" %
                             (fastq, options.sra))
        # Pick this read's window and the command used to read the archive.
        mapArgs.update(
            seq_start=options.readLength * (read - 1),
            seq_end=options.readLength * (read),
            bash_reader=options.sra + ' -Z',
        )
    elif options.verbose:
        sys.stdout.write("Map fastq %s\n" % (fastq,))

    mapping.iterative_mapping(**mapArgs)

    return bamOutput
예제 #7
0
def mapFile(fastq, read):
	"""Iteratively map one read side of ``fastq`` and return the BAM path.

	Inputs with a ``.sra`` extension are read through ``options.sra -Z``
	and restricted to the window
	``options.readLength * (read - 1) .. options.readLength * read``;
	any other input is mapped without a window.  All settings are taken
	from the module-level ``options`` object.

	Args:
		fastq: Path of the input file (``.sra`` or anything else).
		read: Read number; only used to select the window of ``.sra``
			inputs.

	Returns:
		``options.outputDir`` + the input's base name without extension +
		``.bam``.
	"""
	fileName, fileExtension = os.path.splitext(fastq)
	# NOTE(review): the output name does not contain `read`, so mapping
	# read 1 and read 2 of the same .sra file yields the same BAM path --
	# confirm that callers do not rely on both files existing at once.
	bamOutput = options.outputDir+fileName.split(os.sep)[-1]+'.bam'

	# Settings shared by both input kinds.
	mapArgs = dict(
		bowtie_path=options.bowtie,
		bowtie_index_path=options.index,
		fastq_path=fastq,
		out_sam_path=bamOutput,
		min_seq_len=options.minSeqLength,
		len_step=options.stepSize,
		nthreads=options.cpus,
		temp_dir=options.tmpDir,
		bowtie_flags='--very-sensitive',
	)

	if fileExtension == '.sra':
		if options.verbose:
			# sys.stdout.write behaves the same on Python 2 and 3, unlike
			# the `print >> stream` statement.
			sys.stdout.write("Map short read archive %s utilizing %s\n" % (fastq, options.sra))
		# Pick this read's window and the command used to read the archive.
		mapArgs.update(
			seq_start=options.readLength*(read-1),
			seq_end=options.readLength*(read),
			bash_reader=options.sra+' -Z',
		)
	elif options.verbose:
		sys.stdout.write("Map fastq %s\n" % (fastq,))

	mapping.iterative_mapping(**mapArgs)

	return bamOutput
예제 #8
0
# Input archive and output prefix both live in tmp_folder and share
# base_filename (both names are defined outside this excerpt).
FASTQ_fpath = tmp_folder + '/' + base_filename + '.sra'
out_sam_fpath = tmp_folder + '/' + base_filename
genome_name = 'mm9'  # name of the bowtie2 index used below

# os.mkdir creates only the last path component, so the parent of
# tmp_folder must already exist.
if not os.path.exists(tmp_folder):
    os.mkdir(tmp_folder)

# A. Map the reads iteratively.
# This call writes <out_sam_fpath>_1.bam from the seq_start=0 / seq_end=49
# window of the input.
mapping.iterative_mapping(
    bowtie_path='../../bin/bowtie2/bowtie2',
    bowtie_index_path='../../bin/bowtie2/index/' + genome_name,
    fastq_path=FASTQ_fpath,
    out_sam_path=out_sam_fpath + '_1.bam',
    min_seq_len=25,
    len_step=5,
    seq_start=0,
    seq_end=49,
    nthreads=8,  # on Intel Core i7 CPUs 4 threads are as fast as
    # 8, but leave some room for your other applications
    # max_reads_per_chunk = 10000000,  # optional, on low-memory machines
    temp_dir=tmp_folder,  # optional, keep temporary files here
    bowtie_flags='--very-sensitive',
    # presumably the command hiclib runs to read the .sra input -- confirm
    bash_reader='../../bin/sra/bin/fastq-dump -Z')

mapping.iterative_mapping(
    bowtie_path='../../bin/bowtie2/bowtie2',
    bowtie_index_path='../../bin/bowtie2/index/' + genome_name,
    fastq_path=FASTQ_fpath,
    out_sam_path=out_sam_fpath + '_2.bam',
    min_seq_len=25,
    len_step=5,
예제 #9
0
for i in sorted(os.listdir(fastqDir)):
    # Every entry of fastqDir is treated as one experiment folder.
    expName = i
    folder = os.path.join(fastqDir, expName)

    # Use the first FASTQ matching each side.  Fail with a message that
    # names the folder rather than an IndexError from an empty match list.
    side1Matches = glob.glob(folder+"/*1.fastq")
    side2Matches = glob.glob(folder+"/*2.fastq")
    if not side1Matches or not side2Matches:
        raise IOError(
            "Expected *1.fastq and *2.fastq files in {0}".format(folder))
    file1 = side1Matches[0]
    file2 = side2Matches[0]

    # glob can return dangling symlinks, so the existence check stays, but
    # it now raises a real exception: a bare `raise` has nothing to
    # re-raise at this point.
    if not os.path.exists(file1):
        raise IOError("FASTQ file does not exist: {0}".format(file1))
    if not os.path.exists(file2):
        raise IOError("FASTQ file does not exist: {0}".format(file2))

    # A. Map the reads iteratively.
    mapping.iterative_mapping(
        bowtie_path=bowtiePath,
        bowtie_index_path=bowtieIndex,
        fastq_path=file1,
        out_sam_path='sams/%s_1.bam' % expName,
        # for bacteria the minimal mappable length is slightly over 10 bp,
        # so start with 10 bp
        min_seq_len=10,
        len_step=3,       # and go with a smaller step
        seq_start=0,
        seq_end=40,
        nthreads=4,  # on Intel Core i7 CPUs 4 threads are as fast as
                     # 8, but leave some room for your other applications
        # max_reads_per_chunk = 10000000,  # optional, on low-memory machines
        temp_dir='tmp',  # optional, keep temporary files here
        bowtie_flags='--very-sensitive')

    mapping.iterative_mapping(
        bowtie_path=bowtiePath,
        bowtie_index_path=bowtieIndex,
        fastq_path=file2,
        out_sam_path='sams/%s_2.bam' % expName,
        min_seq_len=10,
        len_step=3,
        seq_start=0,
예제 #10
0
# Hard-coded plain FASTQ input; the output prefix is built from tmp_folder
# and base_filename, which are defined outside this excerpt.
FASTQ_fpath="/mnt/storage/home/vsfishman/tmp/Distr/LA2008_NcoI/LA.fastq"
out_sam_fpath=tmp_folder+'/'+base_filename
genome_name='mm9'  # name of the bowtie2 index used below

# os.mkdir creates only the last path component, so the parent of
# tmp_folder must already exist.
if not os.path.exists(tmp_folder):
    os.mkdir(tmp_folder)

# A. Map the reads iteratively.
# Only the call that writes <out_sam_fpath>_2.bam (window seq_start=76 /
# seq_end=151) is present in this excerpt.

mapping.iterative_mapping(
    bowtie_path='../bin/bowtie2/bowtie2',
    bowtie_index_path='../bin/bowtie2/index/'+genome_name,
    fastq_path=FASTQ_fpath,
    out_sam_path=out_sam_fpath+'_2.bam',
    min_seq_len=25,
    len_step=5,
    seq_start=76,
    seq_end=151,
    nthreads=8,
    # max_reads_per_chunk = 10000000,  # optional, on low-memory machines
    temp_dir=tmp_folder,  # temporary files are kept here
    bowtie_flags='--very-sensitive',
    # The SRA reader ('../../bin/sra/bin/fastq-dump -Z') is disabled here;
    # presumably because the input is already a FASTQ file -- confirm.
    bash_reader=None)


# B. Parse the mapped sequences into a Python data structure,
#    assign the ultra-sonic fragments to restriction fragments.
#mapped_reads = h5dict.h5dict(maped_reads_filepath)
#genome_db    = genome.Genome('../fasta/'+genome_name, readChrms=['#', 'X'])

#mapping.parse_sam(
    #sam_basename1=out_sam_fpath+'_1.bam',
예제 #11
0
    lock.close()

    atexit.register(cleanFile, lockName)

    os.system("rm -rf {0}/{1}*".format(samFolder, expName.replace(".sra", "")))

    # First step. Map the reads iteratively.
    mapping.iterative_mapping(
        bowtie_path=bowtiePath,
        bowtie_index_path=bowtieIndex,
        fastq_path=file1,
        out_sam_path='{0}/{1}_1.bam'.format(samFolder, expName),
        min_seq_len=
        minlen,  # for bacteria mimimal mappable length is 15 bp, so I start with something slightly longer
        len_step=step,  # and go with a usualy step
        nthreads=threads,  # on intel corei7 CPUs 4 threads are as fast as
        # 8, but leave some room for you other applications
        # max_reads_per_chunk = 10000000,  #optional, on low-memory machines
        temp_dir=tmpDir,
        seq_start=0,
        seq_end=length,
        bash_reader="fastq-dump -Z",
        bowtie_flags=" --very-sensitive ",
    )

    mapping.iterative_mapping(
        bowtie_path=bowtiePath,
        bowtie_index_path=bowtieIndex,
        fastq_path=file1,
        out_sam_path='{0}/{1}_2.bam'.format(samFolder, expName),
        min_seq_len=minlen,
예제 #12
0
import logging
from hiclib import mapping
from mirnylib import h5dict, genome

# Configure the root logger at DEBUG level.
logging.basicConfig(level=logging.DEBUG)

# NOTE(review): `os` is not among the imports visible directly above this
# block -- confirm it is imported earlier in the file.
# os.mkdir creates only the last path component, so '../data' must exist.
if not os.path.exists('../data/tmp'):
    os.mkdir('../data/tmp')

# Map the reads iteratively.
# This call maps the first FASTQ of the pair (window seq_start=0 /
# seq_end=35) into SRR1658595_10M_1.bam.
mapping.iterative_mapping(
    bowtie_path='/usr/bin/bowtie2',
    bowtie_index_path='../Index/hg19',
    fastq_path='../data/SRR1658595_10M_1.fastq',
    out_sam_path='../data/SRR1658595_10M_1.bam',
    min_seq_len=25,
    len_step=5,
    seq_start=0,
    seq_end=35,
    nthreads=2,
    temp_dir='../data/tmp',  # the directory created above
    bowtie_flags='--very-sensitive')

mapping.iterative_mapping(
    bowtie_path='/usr/bin/bowtie2',
    bowtie_index_path='../Index/hg19',
    fastq_path='../data/SRR1658595_10M_2.fastq',
    out_sam_path='../data/SRR1658595_10M_2.bam',
    min_seq_len=25,
    len_step=5,
    seq_start=0,
    seq_end=35,
예제 #13
0
def step1(hiclib_path, dataset='Kalhor2012NB', sraid='SRR071231', readlen=40):
    """Run hiclib iterative mapping on one SRA run and parse the result.

    Adopted from the hiclib tutorial:
    http://mirnylab.bitbucket.org/hiclib/tutorial/01_iterative_mapping.html

    Args:
        hiclib_path: Path of the hiclib folder on this machine; bowtie2, its
            hg19 index, fastq-dump and the hg19 FASTA are looked up under it.
        dataset: Sub-folder of ``../data/SRA/`` that holds the run.
        sraid: SRA run accession; it names the ``.sra`` input, both BAM
            outputs and the ``<sraid>_mapped_reads.hdf5`` file created in
            the current directory.
        readlen: Length of each read side (40 by default).
    """
    # Function-local imports, as in the tutorial this was adopted from.
    import os
    import logging
    from hiclib import mapping
    from mirnylib import h5dict, genome

    logging.basicConfig(level=logging.DEBUG)

    data_dir = '../data/SRA/'
    first_bam = data_dir + sraid + '_1.bam'
    second_bam = data_dir + sraid + '_2.bam'

    # Settings shared by both mapping passes.
    shared = dict(
        bowtie_path=hiclib_path + '/bin/bowtie2/bowtie2',
        bowtie_index_path=hiclib_path + '/bin/bowtie2/index/hg19',
        fastq_path=data_dir + dataset + '/' + sraid + '/' + sraid + '.sra',
        min_seq_len=25,
        len_step=5,
        nthreads=12,  # leave some cores free for other applications
        # max_reads_per_chunk=10000000 is optional, for low-memory machines
        temp_dir=data_dir,  # optional, temporary files are kept here
        bowtie_flags='--very-sensitive',
        bash_reader=hiclib_path + '/bin/sra/bin/fastq-dump -Z',
    )

    # A. Map the reads iteratively: the first side from the window
    #    0..readlen, the second side from readlen..2*readlen.
    mapping.iterative_mapping(out_sam_path=first_bam,
                              seq_start=0,
                              seq_end=readlen,
                              **shared)
    mapping.iterative_mapping(out_sam_path=second_bam,
                              seq_start=readlen,
                              seq_end=2 * readlen,
                              **shared)

    # B. Parse the mapped sequences into a Python data structure,
    #    assign the ultra-sonic fragments to restriction fragments.
    mapped_reads = h5dict.h5dict(sraid + '_mapped_reads.hdf5')  # local folder
    genome_db = genome.Genome(hiclib_path + '/fasta/hg19',
                              readChrms=['#', 'X'])

    mapping.parse_sam(sam_basename1=first_bam,
                      sam_basename2=second_bam,
                      out_dict=mapped_reads,
                      genome_db=genome_db,
                      enzyme_name='HindIII')
예제 #14
0
from mirnylib import h5dict, genome

# Configure the root logger at DEBUG level.
logging.basicConfig(level=logging.DEBUG)

# os.mkdir creates only the last path component, so '../../data/sample'
# must already exist.
if not os.path.exists('../../data/sample/tmp/'):
    os.mkdir('../../data/sample/tmp/')

# A. Map the reads iteratively.
# This call writes the *_1.bam file from the seq_start=0 / seq_end=75
# window of the .sra input.
mapping.iterative_mapping(
    bowtie_path='../../bin/bowtie2/bowtie2',
    bowtie_index_path='../../bin/bowtie2/index/hg19',
    fastq_path='../../data/sample/SRR027956.sra',
    # NOTE(review): the input accession is SRR027956 but the output is
    # named SRR027056 -- confirm whether this is intentional before
    # renaming, since later steps may refer to the same output name.
    out_sam_path='../../data/sample/SRR027056_1.bam',
    min_seq_len=25,
    len_step=5,
    seq_start=0,
    seq_end=75,
    nthreads=4,  # on Intel Core i7 CPUs 4 threads are as fast as
    # 8, but leave some room for your other applications
    # max_reads_per_chunk = 10000000,  # optional, on low-memory machines
    temp_dir='../../data/sample/tmp',  # optional, keep temporary files here
    bowtie_flags='--very-sensitive',
    # presumably the command hiclib runs to read the .sra input -- confirm
    bash_reader='../../bin/sra/bin/fastq-dump -Z')

mapping.iterative_mapping(
    bowtie_path='../../bin/bowtie2/bowtie2',
    bowtie_index_path='../../bin/bowtie2/index/hg19',
    fastq_path='../../data/sample/SRR027956.sra',
    out_sam_path='../../data/sample/SRR027056_2.bam',
    min_seq_len=25,
    len_step=5,
예제 #15
0
from hiclib import mapping
from mirnylib import h5dict, genome

# Configure the root logger at DEBUG level.
logging.basicConfig(level=logging.DEBUG)

# os.mkdir creates only the last path component, so '../../data/serov'
# must already exist.
if not os.path.exists('../../data/serov/tmp/'):
    os.mkdir('../../data/serov/tmp/')

# A. Map the reads iteratively.
# This call writes HiC_Sp_1.bam from the seq_start=0 / seq_end=50 window
# of the FASTQ input, using the mm10 index.
mapping.iterative_mapping(
    bowtie_path='../../bin/bowtie2/bowtie2',
    bowtie_index_path='../../bin/bowtie2/index/mm10',
    fastq_path='../../data/serov/HiC_Sp.fastq',
    out_sam_path='../../data/serov/HiC_Sp_1.bam',
    min_seq_len=25,
    len_step=5,
    seq_start=0,
    seq_end=50,
    nthreads=4,  # on Intel Core i7 CPUs 4 threads are as fast as
    # 8, but leave some room for your other applications
    # max_reads_per_chunk = 10000000,  # optional, on low-memory machines
    temp_dir='../../data/serov/tmp',  # optional, keep temporary files here
    bowtie_flags='--very-sensitive')
# The SRA reader (bash_reader='../../bin/sra/bin/fastq-dump -Z') is left
# disabled; presumably because the input is already a FASTQ file -- confirm.

mapping.iterative_mapping(
    bowtie_path='../../bin/bowtie2/bowtie2',
    bowtie_index_path='../../bin/bowtie2/index/mm10',
    fastq_path='../../data/serov/HiC_Sp.fastq',
    out_sam_path='../../data/serov/HiC_Sp_2.bam',
    min_seq_len=25,
    len_step=5,
예제 #16
0
    os.system("rm -rf {0}/{1}*".format(samFolder, expName.replace(".sra", "")))






# First step. Map the reads iteratively.
    mapping.iterative_mapping(
        bowtie_path=bowtiePath,
        bowtie_index_path=bowtieIndex,
        fastq_path=file1,
        out_sam_path='{0}/{1}_1.bam'.format(samFolder, expName),
        min_seq_len=minlen,  # for bacteria mimimal mappable length is 15 bp, so I start with something slightly longer
        len_step=step,  # and go with a usualy step
        nthreads=threads,  # on intel corei7 CPUs 4 threads are as fast as
                     # 8, but leave some room for you other applications
        # max_reads_per_chunk = 10000000,  #optional, on low-memory machines
        temp_dir=tmpDir,
        seq_start=0,
        seq_end=length,
        bash_reader="fastq-dump -Z",
        bowtie_flags=" --very-sensitive ",
        )

    mapping.iterative_mapping(
        bowtie_path=bowtiePath,
        bowtie_index_path=bowtieIndex,
        fastq_path=file1,
        out_sam_path='{0}/{1}_2.bam'.format(samFolder, expName),
        min_seq_len=minlen,
        len_step=step,