def merge_bams(bam1, bam2, output_bam, cpus=cpu_count(), samtools='samtools',
               verbose=True):
    """
    Merge two BAM files into one with samtools.

    :param bam1: path to first file
    :param bam2: path to second file
    """
    samtools = which(samtools)
    if verbose:
        print(' - Merging experiments')
    system(samtools + ' merge -@ %d %s %s %s' % (cpus, output_bam, bam1, bam2))
    if verbose:
        print(' - Indexing new BAM file')
    # check samtools version number and adapt the command line:
    # multi-threaded indexing (-@) is only available from v1.3.1 on
    version = LooseVersion([
        l.split()[1]
        for l in Popen(samtools, stderr=PIPE,
                       universal_newlines=True).communicate()[1].split('\n')
        if 'Version' in l][0])
    if version >= LooseVersion('1.3.1'):
        system(samtools + ' index -@ %d %s' % (cpus, output_bam))
    else:
        system(samtools + ' index %s' % (output_bam))
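# Usage sketch (illustrative, not part of the original source): merge two
# replicate BAM files on 4 CPUs. The file names are hypothetical, and the
# call assumes samtools is on the PATH and the module-level imports of the
# original file (cpu_count, which, system, Popen, PIPE, LooseVersion).
def _example_merge_bams():
    merge_bams('replicate_1.bam', 'replicate_2.bam', 'merged.bam', cpus=4)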
def oneD(tmp_dir='.', form='tot ~ s(map) + s(cg) + s(res)', p_fit=None,
         seed=1, **kwargs):
    """
    Normalizes according to oneD normalization that takes into account the GC
    content, mappability and the number of restriction sites per bin.

    Vidal, E., le Dily, F., Quilez, J., Stadhouders, R., Cuartero, Y.,
    Graf, T., Marti-Renom, Marc A., Beato, M., Filion, G. (2017). OneD:
    increasing reproducibility of Hi-C samples with abnormal karyotypes.
    bioRxiv. http://doi.org/10.1101/148254

    :param form: string representing an R formula
    :param None p_fit: proportion of data to be used in fitting (for very
       large datasets). Number between 0 and 1
    :param kwargs: dictionary with keys present in the formula and values
       being lists of equal length. For example:
       oneD(tot=[1,2,3...], map=[1,2,3...], res=[1,2,3...], cg=[1,2,3...])

    :returns: list of biases to use to normalize the raw matrix of
       interactions
    """
    script_path = which('normalize_oneD.R')
    proc_par = ["Rscript", "--vanilla", script_path]

    in_csv = path.join(tmp_dir, 'tot.csv')
    proc_par.append(in_csv)

    csvfile = open(in_csv, 'w')
    headers = sorted(kwargs.keys())
    csvfile.write(','.join(headers) + '\n')
    csvfile.write('\n'.join(','.join(str(kwargs[k][i]) for k in headers)
                            for i in range(len(kwargs['tot']))) + '\n')
    csvfile.close()

    out_csv = path.join(tmp_dir, 'biases.csv')
    proc_par.append(out_csv)

    proc_par.append('"%s"' % (form))

    if p_fit:
        proc_par.append(str(p_fit))

    if seed > 1:
        proc_par.append(str(seed))
    elif seed < 1:
        raise Exception(('ERROR: seed number (currently: %d) should be an '
                         'integer greater than 1 (because of R)') % (seed))

    proc = Popen(proc_par, stderr=PIPE, universal_newlines=True)
    err = proc.stderr.readlines()
    print('\n'.join(err))

    biases_oneD = genfromtxt(out_csv, delimiter=',', dtype=float)

    return biases_oneD
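# Usage sketch (illustrative, not part of the original source): the keyword
# arguments must match the terms of the R formula and hold one value per
# genomic bin. The numbers below are made up, and `normalize_oneD.R` is
# assumed to be findable on the PATH.
def _example_oneD():
    biases = oneD(tmp_dir='/tmp',
                  tot=[10, 20, 30, 40],        # raw interaction counts
                  map=[0.9, 0.8, 0.95, 0.7],   # mappability per bin
                  res=[5, 6, 4, 7],            # restriction sites per bin
                  cg=[0.4, 0.5, 0.45, 0.6])    # GC content per bin
    return biases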
def oneD(tmp_dir='.', form='tot ~ s(map) + s(cg) + s(res)', p_fit=None,
         seed=1, **kwargs):
    """
    Normalizes according to oneD normalization that takes into account the GC
    content, mappability and the number of restriction sites per bin.

    Vidal, E., le Dily, F., Quilez, J., Stadhouders, R., Cuartero, Y.,
    Graf, T., Marti-Renom, Marc A., Beato, M., Filion, G. (2017). OneD:
    increasing reproducibility of Hi-C samples with abnormal karyotypes.
    bioRxiv. http://doi.org/10.1101/148254

    :param form: string representing an R formula
    :param None p_fit: proportion of data to be used in fitting (for very
       large datasets). Number between 0 and 1
    :param kwargs: dictionary with keys present in the formula and values
       being lists of equal length. For example:
       oneD(tot=[1,2,3...], map=[1,2,3...], res=[1,2,3...], cg=[1,2,3...])

    :returns: list of biases to use to normalize the raw matrix of
       interactions
    """
    script_path = which('normalize_oneD.R')
    proc_par = ["Rscript", "--vanilla", script_path]

    in_csv = path.join(tmp_dir, 'tot.csv')
    proc_par.append(in_csv)

    csvfile = open(in_csv, 'w')
    headers = sorted(kwargs.keys())
    csvfile.write(','.join(headers) + '\n')
    csvfile.write('\n'.join(','.join(str(kwargs[k][i]) for k in headers)
                            for i in xrange(len(kwargs['tot']))) + '\n')
    csvfile.close()

    out_csv = path.join(tmp_dir, 'biases.csv')
    proc_par.append(out_csv)

    proc_par.append('"%s"' % (form))

    if p_fit:
        proc_par.append(str(p_fit))

    if seed > 1:
        proc_par.append(str(seed))
    elif seed < 1:
        raise Exception(('ERROR: seed number (currently: %d) should be an '
                         'integer greater than 1 (because of R)') % (seed))

    proc = Popen(proc_par, stderr=PIPE)
    err = proc.stderr.readlines()
    print '\n'.join(err)

    biases_oneD = genfromtxt(out_csv, delimiter=',', dtype=float)

    return biases_oneD
def _bowtie2_mapping(bowtie2_index_path, fastq_path1, out_map_path,
                     fastq_path2=None, bowtie2_binary='bowtie2',
                     bowtie2_params=None, **kwargs):
    """
    Map FASTQ reads with bowtie2 (or hisat2) to an indexed reference genome.
    """
    bowtie2_index_path = os.path.abspath(
        os.path.expanduser(bowtie2_index_path))
    fastq_path1 = os.path.abspath(os.path.expanduser(fastq_path1))
    paired_map = False
    if fastq_path2:
        fastq_path2 = os.path.abspath(os.path.expanduser(fastq_path2))
        paired_map = True
    out_map_path = os.path.abspath(os.path.expanduser(out_map_path))
    nthreads = kwargs.get('nthreads', 8)

    # check that we have the bowtie2/hisat2 binary
    # (keep the name for the error message before which() replaces it)
    binary_name = bowtie2_binary
    bowtie2_binary = which(bowtie2_binary)
    if not bowtie2_binary:
        raise Exception('\n\nERROR: %s binary not found' % binary_name)

    # mapping
    print('TO %s' % bowtie2_binary, fastq_path1, fastq_path2)
    bowtie2_cmd = [
        bowtie2_binary, '-x', bowtie2_index_path, '-p', str(nthreads),
        '--reorder', '-k', '1', '-S', out_map_path]
    if paired_map:
        bowtie2_cmd += ['-1', fastq_path1, '-2', fastq_path2]
    else:
        bowtie2_cmd += ['-U', fastq_path1]
    if bowtie2_params:
        if isinstance(bowtie2_params, dict):
            for bow_param in bowtie2_params:
                bowtie2_cmd.append('-' + bow_param)
                if bowtie2_params[bow_param]:
                    bowtie2_cmd.append(bowtie2_params[bow_param])
        elif isinstance(bowtie2_params, list):
            bowtie2_cmd += bowtie2_params
    elif os.path.basename(bowtie2_binary) == 'bowtie2':
        # default preset only makes sense for bowtie2 itself, not hisat2
        # (compare on the basename: which() returned the full path)
        bowtie2_cmd.append('--very-sensitive')

    print(' '.join(bowtie2_cmd))
    try:
        out, err = Popen(bowtie2_cmd, stdout=PIPE, stderr=PIPE,
                         universal_newlines=True).communicate()
    except CalledProcessError as e:
        print(out)
        print(err)
        raise Exception(e.output)
def _bowtie2_mapping(bowtie2_index_path, fastq_path1, out_map_path,
                     fastq_path2=None, bowtie2_binary='bowtie2',
                     bowtie2_params=None, **kwargs):
    """
    Map FASTQ reads with bowtie2 to an indexed reference genome.
    """
    bowtie2_index_path = os.path.abspath(
        os.path.expanduser(bowtie2_index_path))
    fastq_path1 = os.path.abspath(os.path.expanduser(fastq_path1))
    paired_map = False
    if fastq_path2:
        fastq_path2 = os.path.abspath(os.path.expanduser(fastq_path2))
        paired_map = True
    out_map_path = os.path.abspath(os.path.expanduser(out_map_path))
    nthreads = kwargs.get('nthreads', 8)

    # check that we have the bowtie2 binary:
    bowtie2_binary = which(bowtie2_binary)
    if not bowtie2_binary:
        raise Exception('\n\nERROR: bowtie2 binary not found')

    # mapping
    print 'TO BOWTIE2', fastq_path1, fastq_path2
    bowtie2_cmd = [
        bowtie2_binary, '-x', bowtie2_index_path, '-p', str(nthreads),
        '--reorder', '-S', out_map_path]
    if paired_map:
        bowtie2_cmd += ['-1', fastq_path1, '-2', fastq_path2]
    else:
        bowtie2_cmd += ['-U', fastq_path1]
    if bowtie2_params:
        for bow_param in bowtie2_params:
            bowtie2_cmd.append('-' + bow_param)
            if bowtie2_params[bow_param]:
                bowtie2_cmd.append(bowtie2_params[bow_param])
    else:
        bowtie2_cmd.append('--very-sensitive')
    print ' '.join(bowtie2_cmd)
    try:
        out, err = Popen(bowtie2_cmd, stdout=PIPE, stderr=PIPE).communicate()
    except CalledProcessError as e:
        print out
        print err
        raise Exception(e.output)
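# Usage sketch (illustrative, not part of the original source): paired-end
# mapping with extra bowtie2 options passed as a dict. The code above
# prepends a single '-' to each key, so a long option is given with one
# leading dash already in the key; a value of None appends it as a bare
# flag. All paths are hypothetical.
def _example_bowtie2_mapping():
    _bowtie2_mapping('genome/hg38_index', 'reads_1.fastq', 'sample.sam',
                     fastq_path2='reads_2.fastq', nthreads=8,
                     bowtie2_params={'-very-sensitive-local': None})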
def merge_bams(bam1, bam2, outbam, cpus=cpu_count(), samtools='samtools',
               verbose=True):
    """
    Merge two BAM files into one with samtools.

    :param bam1: path to first file
    :param bam2: path to second file
    """
    samtools = which(samtools)
    if verbose:
        print ' - Merging experiments'
    system(samtools + ' merge -@ %d %s %s %s' % (cpus, outbam, bam1, bam2))
    if verbose:
        print ' - Indexing new BAM file'
    # check samtools version number and adapt the command line:
    # multi-threaded indexing (-@) is only available from v1.3.1 on
    version = LooseVersion([l.split()[1]
                            for l in Popen(samtools,
                                           stderr=PIPE).communicate()[1].split('\n')
                            if 'Version' in l][0])
    if version >= LooseVersion('1.3.1'):
        system(samtools + ' index -@ %d %s' % (cpus, outbam))
    else:
        system(samtools + ' index %s' % (outbam))
def oneD(tmp_dir='.', form='tot ~ s(map) + s(cg) + s(res)', **kwargs):
    """
    Normalizes according to oneD normalization that takes into account the GC
    content, mappability and the number of restriction sites per bin.

    Vidal, E., le Dily, F., Quilez, J., Stadhouders, R., Cuartero, Y.,
    Graf, T., Marti-Renom, Marc A., Beato, M., Filion, G. (2017). OneD:
    increasing reproducibility of Hi-C samples with abnormal karyotypes.
    bioRxiv. http://doi.org/10.1101/148254

    :param form: string representing an R formula
    :param kwargs: dictionary with keys present in the formula and values
       being lists of equal length. For example:
       oneD(tot=[1,2,3...], map=[1,2,3...], res=[1,2,3...], cg=[1,2,3...])

    :returns: list of biases to use to normalize the raw matrix of
       interactions
    """
    # an earlier in-process implementation through rpy2 and dryhic
    # (https://github.com/qenvio/dryhic) is kept here for reference:
    #
    #     try:
    #         form = robjects.Formula(form)
    #     except NameError:
    #         raise Exception('ERROR: dryhic (https://github.com/qenvio/dryhic) '
    #                         'not installed, OneD normalization not available')
    #     info = robjects.DataFrame(dict((k, robjects.FloatVector(kwargs[k]))
    #                                    for k in kwargs))
    #     return map(float64, dryhic.oned(info, form))

    script_path = which('normalize_oneD.R')
    proc_par = ["Rscript", "--vanilla", script_path]

    # write one single-row CSV per variable of the formula
    for key in ('tot', 'map', 'res', 'cg'):
        csvfile = path.join(tmp_dir, key + '.csv')
        proc_par.append(csvfile)
        with open(csvfile, "w") as output:
            writer = csv.writer(output, lineterminator='\n')
            writer.writerow(kwargs[key])

    out_csv = path.join(tmp_dir, 'biases.csv')
    proc_par.append(out_csv)

    subprocess.call(proc_par)

    biases_oneD = genfromtxt(out_csv, delimiter=',', dtype=float)

    return biases_oneD
def _gem_mapping(gem_index_path, fastq_path, out_map_path,
                 gem_binary='gem-mapper', **kwargs):
    """
    :param None focus: trims the sequence in the input FASTQ file according
       to a (start, end) position, or the name of a restriction enzyme. By
       default it uses the full sequence.
    :param 33 quality: set it to 'ignore' in order to speed-up the mapping
    """
    gem_index_path = os.path.abspath(os.path.expanduser(gem_index_path))
    fastq_path = os.path.abspath(os.path.expanduser(fastq_path))
    out_map_path = os.path.abspath(os.path.expanduser(out_map_path))
    nthreads = kwargs.get('nthreads', 8)
    max_edit_distance = kwargs.get('max_edit_distance', 0.04)
    mismatches = kwargs.get('mismatches', 0.04)

    # check that we have the GEM binary:
    gem_binary = which(gem_binary)
    if not gem_binary:
        raise Exception('\n\nERROR: GEM binary not found, install it from:'
                        '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
                        '\n - Download the GEM-binaries-Linux-x86_64-core_i3 '
                        'if you have a recent computer, the '
                        'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
                        'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
                        'Copy the binary gem-mapper to /usr/local/bin/ for '
                        'example (somewhere in your PATH).\n\nNOTE: GEM does '
                        'not provide any binary for MAC-OS.')

    # mapping
    print 'TO GEM', fastq_path
    kgt = kwargs.get
    gem_cmd = [
        gem_binary, '-I', gem_index_path,
        '-q'                        , kgt('q', 'offset-33'),
        '-m'                        , kgt('m', str(max_edit_distance)),
        '-s'                        , kgt('s', kgt('strata-after-best', '0')),
        '--allow-incomplete-strata' , kgt('allow-incomplete-strata', '0.00'),
        '--granularity'             , kgt('granularity', '10000'),
        '--max-decoded-matches'     , kgt('max-decoded-matches', kgt('d', '1')),
        '--min-decoded-strata'      , kgt('min-decoded-strata', kgt('D', '0')),
        '--min-insert-size'         , kgt('min-insert-size', '0'),
        '--max-insert-size'         , kgt('max-insert-size', '0'),
        '--min-matched-bases'       , kgt('min-matched-bases', '0.8'),
        '--gem-quality-threshold'   , kgt('gem-quality-threshold', '26'),
        '--max-big-indel-length'    , kgt('max-big-indel-length', '15'),
        '--mismatch-alphabet'       , kgt('mismatch-alphabet', 'ACGT'),
        '-E'                        , kgt('E', '0.30'),
        '--max-extendable-matches'  , kgt('max-extendable-matches', '20'),
        '--max-extensions-per-match', kgt('max-extensions-per-match', '1'),
        '-e'                        , kgt('e', str(mismatches)),
        '-T'                        , str(nthreads),
        '-i'                        , fastq_path,
        '-o', out_map_path.replace('.map', '')]

    if 'paired-end-alignment' in kwargs or 'p' in kwargs:
        gem_cmd.append('--paired-end-alignment')
    if 'map-both-ends' in kwargs or 'b' in kwargs:
        gem_cmd.append('--map-both-ends')
    if 'fast-mapping' in kwargs:
        gem_cmd.append('--fast-mapping')
    if 'unique-mapping' in kwargs:
        gem_cmd.append('--unique-mapping')
    if 'unique-pairing' in kwargs:
        gem_cmd.append('--unique-pairing')

    # check kwargs
    for kw in kwargs:
        if not kw in ['nthreads', 'max_edit_distance', 'mismatches',
                      'max_reads_per_chunk', 'out_files', 'temp_dir', 'skip',
                      'q', 'm', 's', 'strata-after-best',
                      'allow-incomplete-strata', 'granularity',
                      'max-decoded-matches', 'min-decoded-strata',
                      'min-insert-size', 'max-insert-size',
                      'min-matched-bases', 'gem-quality-threshold',
                      'max-big-indel-length', 'mismatch-alphabet', 'E',
                      'max-extendable-matches', 'max-extensions-per-match',
                      'e', 'paired-end-alignment', 'p', 'map-both-ends',
                      'fast-mapping', 'unique-mapping', 'unique-pairing',
                      'suffix']:
            warn('WARNING: %s not in usual keywords, misspelled?' % kw)

    print ' '.join(gem_cmd)
    try:
        # check_call(gem_cmd, stdout=PIPE, stderr=PIPE)
        out, err = Popen(gem_cmd, stdout=PIPE, stderr=PIPE).communicate()
    except CalledProcessError as e:
        print out
        print err
        raise Exception(e.output)
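# Usage sketch (illustrative, not part of the original source): single-end
# GEM mapping. Keyword arguments with the same name as a GEM option override
# the kgt(...) defaults assembled above; unknown keywords only trigger the
# "misspelled?" warning. Index and FASTQ paths are hypothetical.
def _example_gem_mapping():
    _gem_mapping('genome/hg38.gem', 'reads.fastq', 'reads.map',
                 nthreads=8,
                 mismatches=0.04,          # becomes -e 0.04
                 max_edit_distance=0.04)   # becomes -m 0.04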
def fast_fragment_mapping(mapper_index_path, fastq_path1, fastq_path2, r_enz,
                          genome_seq, out_map, clean=True, get_nread=False,
                          mapper_binary=None, mapper_params=None,
                          samtools='samtools', **kwargs):
    """
    Maps FASTQ reads to an indexed reference genome with the knowledge of the
    restriction enzyme used (fragment-based mapping).

    :param mapper_index_path: path to index file created from a reference
       genome using the gem-index tool, bowtie2-build or hisat2-build
    :param fastq_path1: PATH to FASTQ file of read 1, either compressed or not
    :param fastq_path2: PATH to FASTQ file of read 2, either compressed or not
    :param out_map: path to the output file (tab-separated format) containing
       the mapped read information
    :param r_enz: name of the restriction enzyme used in the experiment,
       e.g. HindIII
    :param genome_seq: a dictionary generated by
       :func:`pytadbit.parser.genome_parser.parse_fasta` containing the
       genomic sequence
    :param True clean: remove intermediate files created in temp_dir
    :param False get_nread: returns a list of lists where each element
       contains a path and the number of reads processed
    :param 8 nthreads: number of threads to use for mapping (number of CPUs)
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will
       be written there.
    :param gem-mapper mapper_binary: path to the binary mapper
    :param None mapper_params: extra parameters for the mapper
    :param samtools samtools: path to samtools binary

    :returns: outfile with the intersected read pairs
    """
    suffix = kwargs.get('suffix', '')
    suffix = ('_' * (suffix != '')) + suffix
    nthreads = kwargs.get('nthreads', 8)
    samtools = which(samtools)
    # check out folder
    if not os.path.isdir(os.path.dirname(os.path.abspath(out_map))):
        raise Exception(
            '\n\nERROR: Path to store the output does not exist.\n')
    temp_dir = os.path.abspath(
        os.path.expanduser(kwargs.get('temp_dir', gettempdir())))
    gem_version = None
    # check that we have the GEM v3 binary:
    gem_binary = mapper_binary or 'gem-mapper'
    gem_binary = which(gem_binary)
    if not gem_binary:
        raise Exception('\n\nERROR: GEM v3 binary not found, install it from:'
                        '\nhttps://github.com/smarco/gem3-mapper\n'
                        'Copy the binary gem-mapper to /usr/local/bin/ for '
                        'example (somewhere in your PATH).\n')
    try:
        out, err = Popen([gem_binary, '--version'], stdout=PIPE,
                         stderr=STDOUT, universal_newlines=True).communicate()
        gem_version = int(out[1])
    except ValueError as e:
        gem_version = 2
        print('Falling back to GEM v2')
    if gem_version < 3:
        raise Exception('\n\nERROR: GEM v3 binary not found, install it from:'
                        '\nhttps://github.com/smarco/gem3-mapper\n'
                        'Copy the binary gem-mapper to /usr/local/bin/ for '
                        'example (somewhere in your PATH).\n')
    if mapper_params:
        kwargs.update(mapper_params)
    # create directories
    for rep in [temp_dir]:
        mkdir(rep)
    # check space
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))

    # fragment-based mapping
    base_name1 = os.path.split(fastq_path1)[-1].replace('.gz', '')
    base_name1 = '.'.join(base_name1.split('.')[:-1])

    curr_map1, _ = transform_fastq(
        fastq_path1, mkstemp(prefix=base_name1 + '_', dir=temp_dir)[1],
        fastq=is_fastq(fastq_path1), nthreads=nthreads, light_storage=True)

    base_name2 = os.path.split(fastq_path2)[-1].replace('.gz', '')
    base_name2 = '.'.join(base_name2.split('.')[:-1])

    curr_map2, count_fastq = transform_fastq(
        fastq_path2, mkstemp(prefix=base_name2 + '_', dir=temp_dir)[1],
        fastq=is_fastq(fastq_path2), nthreads=nthreads, light_storage=True)

    out_map_path = curr_map1 + '_frag%s.map' % (suffix)

    print('Mapping fragments of remaining reads...')
    _gem_mapping(mapper_index_path, curr_map1, out_map_path,
                 fastq_path2=curr_map2, r_enz=r_enz, gem_binary=gem_binary,
                 gem_version=gem_version, **kwargs)

    # clean
    if clean:
        print(' x removing GEM 3 input %s' % (curr_map1))
        os.system('rm -f %s' % (curr_map1))
        print(' x removing GEM 3 input %s' % (curr_map2))
        os.system('rm -f %s' % (curr_map2))

    # sort SAM file by read name
    os.system(samtools + ' sort -n -O SAM -@ %d -T %s -o %s %s' % (
        nthreads, out_map_path, out_map_path, out_map_path))

    genome_lengths = dict((crm, len(genome_seq[crm])) for crm in genome_seq)
    frag_chunk = kwargs.get('frag_chunk', 100000)
    frags = map_re_sites(r_enz, genome_seq, frag_chunk=frag_chunk)

    if samtools and nthreads > 1:
        print('Splitting SAM file')
        # write the header into each chunk
        for i in range(nthreads):
            os.system(samtools + ' view -H -O SAM %s > "%s_%d"' % (
                out_map_path, out_map_path, (i + 1)))
        # estimate lines in SAM with reads and frags
        chunk_lines = int((count_fastq * 2.3) / nthreads)
        os.system(samtools + ''' view -O SAM %s | awk -v n=%d -v FS="\\t" '
            BEGIN { part=0; line=n }
            { if( line>=n && $1!=last_read ) { part++; line=1; print $0 >> "%s_"part }
              else { print $0 >> "%s_"part; line++; }
              last_read = $1; }' ''' % (out_map_path, chunk_lines,
                                        out_map_path, out_map_path))
        if clean:
            print(' x removing tmp mapped %s' % out_map_path)
            os.system('rm -f %s' % (out_map_path))
        print('Parsing results...')
        kwargs['nthreads'] = 1
        procs = []
        pool = mu.Pool(nthreads)
        for i in range(nthreads):
            frags_shared = copy.deepcopy(frags)
            procs.append(pool.apply_async(
                parse_gem_3c,
                args=('%s_%d' % (out_map_path, (i + 1)),
                      '%s_parsed_%d' % (out_map_path, (i + 1)),
                      copy.deepcopy(genome_lengths), frags_shared,
                      False, True),
                kwds=kwargs))
        pool.close()
        pool.join()
        results = [proc.get() for proc in procs if proc.get()]
        if clean:
            for i in range(nthreads):
                print(' x removing tmp mapped %s_%d' % (out_map_path, (i + 1)))
                os.system('rm -f %s_%d' % (out_map_path, (i + 1)))

        # final sort and merge
        nround = 0
        while len(results) > 1:
            nround += 1
            num_procs = min(nthreads, int(len(results) / 2))
            pool = mu.Pool(num_procs)
            procs = [pool.apply_async(merge_sort,
                                      (results.pop(0), results.pop(0),
                                       out_map_path + '_%d' % nround, i, True))
                     for i in range(num_procs)]
            pool.close()
            pool.join()
            results = [proc.get() for proc in procs if proc.get()]

        map_out = open(out_map, 'w')
        tmp_reads_fh = open(results[0], 'r')
        for crm in genome_seq:
            map_out.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        for read_line in tmp_reads_fh:
            read = read_line.split('\t')
            map_out.write('\t'.join([read[0]] + read[2:8] + read[9:]))
        map_out.close()
        if clean:
            print(' x removing tmp mapped %s' % results[0])
            os.system('rm -f %s' % (results[0]))
    else:
        print('Parsing result...')
        parse_gem_3c(out_map_path, out_map, genome_lengths, frags,
                     verbose=False, tmp_format=False, **kwargs)
        # clean
        if clean:
            print(' x removing tmp mapped %s' % out_map_path)
            os.system('rm -f %s' % (out_map_path))

    if get_nread:
        return [(out_map, count_fastq)]
    return out_map
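# Usage sketch (illustrative, not part of the original source):
# fragment-based mapping of a HindIII paired-end experiment with a GEM v3
# index. The genome dictionary would normally come from
# pytadbit.parsers.genome_parser.parse_fasta; every path here is hypothetical.
def _example_fast_fragment_mapping(genome_seq):
    out_map, nreads = fast_fragment_mapping(
        'genome/hg38.gem3', 'reads_1.fastq', 'reads_2.fastq',
        r_enz='HindIII', genome_seq=genome_seq,
        out_map='results/sample_intersection.tsv',
        nthreads=8, temp_dir='/tmp/mapping', get_nread=True)[0]
    return out_map, nreads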
def check_options(opts):
    if opts.cfg:
        get_options_from_cfg(opts.cfg, opts)

    opts.gem_binary = which(opts.gem_binary)
    if not opts.gem_binary:
        raise Exception('\n\nERROR: GEM binary not found, install it from:'
                        '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
                        '\n - Download the GEM-binaries-Linux-x86_64-core_i3 '
                        'if you have a recent computer, the '
                        'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
                        'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
                        'Copy the binary gem-mapper to /usr/local/bin/ for '
                        'example (somewhere in your PATH).\n\nNOTE: GEM does '
                        'not provide any binary for MAC-OS.')

    # check RE name
    try:
        _ = RESTRICTION_ENZYMES[opts.renz]
    except KeyError:
        print ('\n\nERROR: restriction enzyme not found. Use one of:\n\n'
               + ' '.join(sorted(RESTRICTION_ENZYMES)) + '\n\n')
        raise KeyError()
    except AttributeError:
        pass

    # check skip
    if not path.exists(opts.workdir) and opts.skip:
        print ('WARNING: no output files found, nothing to skip...')
        opts.skip = False

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check paths
    if not path.exists(opts.index):
        raise IOError('ERROR: index file not found at ' + opts.index)

    if not path.exists(opts.fastq):
        raise IOError('ERROR: FASTQ file not found at ' + opts.fastq)

    # create tmp directory
    if not opts.tmp:
        opts.tmp = opts.workdir + '_tmp_r%d' % opts.read

    try:
        opts.windows = [[int(i) for i in win.split(':')]
                        for win in opts.windows]
    except TypeError:
        pass

    mkdir(opts.workdir)
    # write log
    # if opts.mapping_only:
    log_format = '[MAPPING {} READ{}] %(message)s'.format(opts.fastq,
                                                          opts.read)
    # else:
    #     log_format = '[DEFAULT] %(message)s'

    # reset logging
    logging.getLogger().handlers = []

    try:
        print 'Writing log to ' + path.join(opts.workdir, 'process.log')
        logging.basicConfig(level=logging.INFO,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log'),
                            filemode='aw')
    except IOError:
        logging.basicConfig(level=logging.DEBUG,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log2'),
                            filemode='aw')

    # to display log on stdout also
    logging.getLogger().addHandler(logging.StreamHandler())

    # write version log
    vlog_path = path.join(opts.workdir,
                          'TADbit_and_dependencies_versions.log')
    dependencies = get_dependencies_version()
    if not path.exists(vlog_path) or open(vlog_path).readlines() != dependencies:
        logging.info('Writing versions of TADbit and dependencies')
        vlog = open(vlog_path, 'w')
        vlog.write(dependencies)
        vlog.close()

    # check GEM mapper extra options
    if opts.gem_param:
        opts.gem_param = dict([o.split(':') for o in opts.gem_param])
    else:
        opts.gem_param = {}
    gem_valid_option = set(["granularity", "q", "quality-format",
                            "gem-quality-threshold", "mismatch-alphabet",
                            "m", "e", "min-matched-bases",
                            "max-big-indel-length", "s", "strata-after-best",
                            "fast-mapping", "unique-mapping", "d", "D",
                            "allow-incomplete-strata", "max-decoded-matches",
                            "min-decoded-strata", "p", "paired-end-alignment",
                            "b", "map-both-ends", "min-insert-size",
                            "max-insert-size", "E", "max-extendable-matches",
                            "max-extensions-per-match", "unique-pairing"])
    for k in opts.gem_param:
        if not k in gem_valid_option:
            raise NotImplementedError(('ERROR: option "%s" not a valid GEM '
                                       'option or not supported by this '
                                       'tool.') % k)
    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        exit('WARNING: exact same job already computed, see JOBs table above')
def check_options(opts):
    if not opts.mapper_binary:
        if opts.mapper == 'gem':
            opts.mapper_binary = 'gem-mapper'
        else:
            opts.mapper_binary = opts.mapper
    opts.mapper_binary = which(opts.mapper_binary)
    if not opts.mapper_binary:
        raise Exception(
            '\n\nERROR: Mapper binary not found, for GEM install it from:'
            '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
            '\n - Download the GEM-binaries-Linux-x86_64-core_i3 '
            'if you have a recent computer, the '
            'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
            'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
            'Copy the binary gem-mapper to /usr/local/bin/ for '
            'example (somewhere in your PATH).\n\nNOTE: GEM does '
            'not provide any binary for MAC-OS.')

    opts.gem_version = 0
    if opts.mapper == 'gem':
        opts.gem_version = None
        try:
            out, _ = Popen([opts.mapper_binary, '--version'], stdout=PIPE,
                           stderr=STDOUT,
                           universal_newlines=True).communicate()
            opts.gem_version = int(out[1])
        except ValueError as e:
            opts.gem_version = 2
            print('Falling back to GEM v2')

    if opts.fast_fragment:
        if opts.gem_version < 3:
            raise Exception('ERROR: Fast fragment mapping needs GEM v3')
        if not opts.fastq2 or not path.exists(opts.fastq2):
            raise Exception(
                'ERROR: Fast fragment mapping needs both fastq files. '
                'Please specify --fastq2')
        if opts.read != 0:
            raise Exception('ERROR: Fast fragment mapping needs to be '
                            'specified with --read 0')
        if not opts.genome:
            raise Exception('ERROR: Fast fragment mapping needs '
                            'the genome parameter.')

    # check RE name
    if opts.renz == ['CHECK']:
        print('\nSearching for most probable restriction enzyme in file: %s' %
              (opts.fastq))
        try:
            pat, enz, pv = identify_re(opts.fastq, nreads=100000)
            print(' -> Most probable digested site: %s (pv: %f)' % (pat, pv))
            print(' -> Enzymes matching: %s' % (', '.join(enz)))
        except ValueError:
            print(' -> Nothing found...')
        exit()
    for n, renz in enumerate(opts.renz):
        if renz == 'NONE':
            opts.renz[n] = None
            continue
        try:
            _ = RESTRICTION_ENZYMES[renz]
        except KeyError:
            print('\n\nERROR: restriction enzyme %s not found. ' % (renz) +
                  'Use one of:\n\n' + ' '.join(sorted(RESTRICTION_ENZYMES)) +
                  '\n\n')
            raise KeyError()
        except AttributeError:
            pass

    # check skip
    if not path.exists(opts.workdir) and opts.skip:
        print('WARNING: no output files found, nothing to skip...')
        opts.skip = False

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check paths
    if opts.mapper == 'gem' and not path.exists(opts.index):
        raise IOError('ERROR: index file not found at ' + opts.index)

    if not path.exists(opts.fastq):
        raise IOError('ERROR: FASTQ file not found at ' + opts.fastq)

    if not is_fastq(opts.fastq):
        raise IOError(('ERROR: FASTQ file %s wrong format, check') %
                      (opts.fastq))

    try:
        opts.windows = [[int(i) for i in win.split(':')]
                        for win in opts.windows]
    except TypeError:
        pass

    mkdir(opts.workdir)
    # write log
    # if opts.mapping_only:
    log_format = '[MAPPING {} READ{}] %(message)s'.format(opts.fastq,
                                                          opts.read)
    # else:
    #     log_format = '[DEFAULT] %(message)s'

    # reset logging
    logging.getLogger().handlers = []

    try:
        print('Writing log to ' + path.join(opts.workdir, 'process.log'))
        logging.basicConfig(level=logging.INFO,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log'),
                            filemode='a+')
    except IOError:
        logging.basicConfig(level=logging.DEBUG,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log2'),
                            filemode='a+')

    # to display log on stdout also
    logging.getLogger().addHandler(logging.StreamHandler())

    # write version log
    vlog_path = path.join(opts.workdir,
                          'TADbit_and_dependencies_versions.log')
    dependencies = get_dependencies_version()
    if not path.exists(vlog_path) or open(vlog_path).readlines() != dependencies:
        logging.info('Writing versions of TADbit and dependencies')
        vlog = open(vlog_path, 'w')
        vlog.write(dependencies)
        vlog.close()

    # check mapper extra options
    if opts.mapper_param:
        if (len(opts.mapper_param) == 1 and
                ('-' in opts.mapper_param[0] or '--' in opts.mapper_param[0])):
            # single string surrounded by quotes: split into raw arguments
            opts.mapper_param = opts.mapper_param[0].split()
        else:
            opts.mapper_param = dict([o.split(':')
                                      for o in opts.mapper_param])
    else:
        opts.mapper_param = {}
    if opts.mapper == 'gem' and opts.gem_version < 3:
        gem_valid_option = set([
            "granularity", "q", "quality-format", "gem-quality-threshold",
            "mismatch-alphabet", "m", "e", "min-matched-bases",
            "max-big-indel-length", "s", "strata-after-best", "fast-mapping",
            "unique-mapping", "d", "D", "allow-incomplete-strata",
            "max-decoded-matches", "min-decoded-strata", "p",
            "paired-end-alignment", "b", "map-both-ends", "min-insert-size",
            "max-insert-size", "E", "max-extendable-matches",
            "max-extensions-per-match", "unique-pairing"])
        for k in opts.mapper_param:
            if not k in gem_valid_option:
                raise NotImplementedError(('ERROR: option "%s" not a valid '
                                           'GEM option or not supported by '
                                           'this tool.') % k)

    # create empty DB if it doesn't exist
    dbpath = path.join(opts.workdir, 'trace.db')
    open(dbpath, 'a').close()

    # for lustre file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join([ascii_letters[int(random() * 52)]
                                        for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass
    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        if 'tmpdb' in opts and opts.tmpdb:
            remove(path.join(dbdir, dbfile))
        exit('WARNING: exact same job already computed, see JOBs table above')
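# Illustrative check (not part of the original source) of the "name:value"
# parsing used above for the mapper extra options: tokens split on ':'
# become a dict of mapper options, unless a single quoted string of raw
# arguments (containing dashes) is given instead.
def _example_mapper_param_parsing():
    tokens = ['e:0.08', 'min-matched-bases:0.9']
    assert dict(o.split(':') for o in tokens) == {'e': '0.08',
                                                  'min-matched-bases': '0.9'}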
def run(opts):
    check_options(opts)
    samtools = which(opts.samtools)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bam1:
        mreads1 = path.realpath(opts.bam1)
        biases1 = opts.biases1
    else:
        biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)
        try:
            biases1 = path.join(opts.workdir1, biases1)
        except AttributeError:
            biases1 = None

    if opts.bam2:
        mreads2 = path.realpath(opts.bam2)
        biases2 = opts.biases2
    else:
        biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)
        try:
            biases2 = path.join(opts.workdir2, biases2)
        except AttributeError:
            biases2 = None

    filter_exclude = opts.filter

    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        printime(' - loading first sample %s' % (mreads1))
        hic_data1 = load_hic_data_from_bam(mreads1, opts.reso, biases=biases1,
                                           tmpdir=path.join(opts.workdir,
                                                            '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        printime(' - loading second sample %s' % (mreads2))
        hic_data2 = load_hic_data_from_bam(mreads2, opts.reso, biases=biases2,
                                           tmpdir=path.join(opts.workdir,
                                                            '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        decay_corr_dat = path.join(opts.workdir, '00_merge',
                                   'decay_corr_dat_%s_%s.txt' % (opts.reso,
                                                                 param_hash))
        decay_corr_fig = path.join(opts.workdir, '00_merge',
                                   'decay_corr_dat_%s_%s.png' % (opts.reso,
                                                                 param_hash))
        eigen_corr_dat = path.join(opts.workdir, '00_merge',
                                   'eigen_corr_dat_%s_%s.txt' % (opts.reso,
                                                                 param_hash))
        eigen_corr_fig = path.join(opts.workdir, '00_merge',
                                   'eigen_corr_dat_%s_%s.png' % (opts.reso,
                                                                 param_hash))

        printime(' - comparing experiments')
        printime('   => correlation between equidistant loci')
        corr, _, scc, std, bads = correlate_matrices(
            hic_data1, hic_data2, normalized=opts.norm,
            remove_bad_columns=True, savefig=decay_corr_fig,
            savedata=decay_corr_dat, get_bads=True)
        print ' - correlation score (SCC): %.4f (+- %.7f)' % (scc, std)
        printime('   => correlation between eigenvectors')
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2,
                                          normalized=opts.norm,
                                          remove_bad_columns=True, nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)

        printime('   => reproducibility score')
        reprod = get_reproducibility(hic_data1, hic_data2, num_evec=20,
                                     normalized=opts.norm, verbose=False,
                                     remove_bad_columns=True)
        print ' - reproducibility score: %.4f' % (reprod)
        ncols = len(hic_data1)
    else:
        ncols = 0
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'
        # also initialize the scores skipped above so that save_to_db
        # always receives defined values
        corr = eig_corr = scc = std = reprod = 0
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s.bam' % (param_hash))

    printime(' - Merging experiments')
    system(samtools + ' merge -@ %d %s %s %s' % (opts.cpus, outbam,
                                                 mreads1, mreads2))
    printime(' - Indexing new BAM file')
    # check samtools version number and adapt the command line:
    # multi-threaded indexing (-@) is only available from v1.3.1 on
    version = LooseVersion([l.split()[1]
                            for l in Popen(samtools,
                                           stderr=PIPE).communicate()[1].split('\n')
                            if 'Version' in l][0])
    if version >= LooseVersion('1.3.1'):
        system(samtools + ' index -@ %d %s' % (opts.cpus, outbam))
    else:
        system(samtools + ' index %s' % (outbam))

    finish_time = time.localtime()
    save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
               len(bads.keys()), ncols, scc, std, reprod,
               eigen_corr_dat, eigen_corr_fig, outbam, corr, eig_corr,
               biases1, biases2, launch_time, finish_time)
    printime('\nDone.')
def gem_mapping(gem_index_path, fastq_path, out_map_path,
                gem_binary='gem-mapper', **kwargs):
    """
    :param None focus: trims the sequence in the input FASTQ file according
       to a (start, end) position, or the name of a restriction enzyme. By
       default it uses the full sequence.
    :param 33 quality: set it to 'ignore' in order to speed-up the mapping
    """
    gem_index_path = os.path.abspath(os.path.expanduser(gem_index_path))
    fastq_path = os.path.abspath(os.path.expanduser(fastq_path))
    out_map_path = os.path.abspath(os.path.expanduser(out_map_path))
    nthreads = kwargs.get('nthreads', 8)
    max_edit_distance = kwargs.get('max_edit_distance', 0.04)
    mismatches = kwargs.get('mismatches', 0.04)

    # check that we have the GEM binary:
    gem_binary = which(gem_binary)
    if not gem_binary:
        raise Exception('\n\nERROR: GEM binary not found, install it from:'
                        '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
                        '\n - Download the GEM-binaries-Linux-x86_64-core_i3 '
                        'if you have a recent computer, the '
                        'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
                        'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
                        'Copy the binary gem-mapper to /usr/local/bin/ for '
                        'example (somewhere in your PATH).\n\nNOTE: GEM does '
                        'not provide any binary for MAC-OS.')

    # mapping
    print 'TO GEM', fastq_path
    kgt = kwargs.get
    gem_cmd = [
        gem_binary, '-I', gem_index_path,
        '-q'                        , kgt('q', 'offset-33'),
        '-m'                        , kgt('m', str(max_edit_distance)),
        '-s'                        , kgt('s', kgt('strata-after-best', '0')),
        '--allow-incomplete-strata' , kgt('allow-incomplete-strata', '0.00'),
        '--granularity'             , kgt('granularity', '10000'),
        '--max-decoded-matches'     , kgt('max-decoded-matches', kgt('d', '1')),
        '--min-decoded-strata'      , kgt('min-decoded-strata', kgt('D', '0')),
        '--min-insert-size'         , kgt('min-insert-size', '0'),
        '--max-insert-size'         , kgt('max-insert-size', '0'),
        '--min-matched-bases'       , kgt('min-matched-bases', '0.8'),
        '--gem-quality-threshold'   , kgt('gem-quality-threshold', '26'),
        '--max-big-indel-length'    , kgt('max-big-indel-length', '15'),
        '--mismatch-alphabet'       , kgt('mismatch-alphabet', 'ACGT'),
        '-E'                        , kgt('E', '0.30'),
        '--max-extendable-matches'  , kgt('max-extendable-matches', '20'),
        '--max-extensions-per-match', kgt('max-extensions-per-match', '1'),
        '-e'                        , kgt('e', str(mismatches)),
        '-T'                        , str(nthreads),
        '-i'                        , fastq_path,
        '-o', out_map_path.replace('.map', '')]

    if 'paired-end-alignment' in kwargs or 'p' in kwargs:
        gem_cmd.append('--paired-end-alignment')
    if 'map-both-ends' in kwargs or 'b' in kwargs:
        gem_cmd.append('--map-both-ends')
    if 'fast-mapping' in kwargs:
        gem_cmd.append('--fast-mapping')
    if 'unique-mapping' in kwargs:
        gem_cmd.append('--unique-mapping')
    if 'unique-pairing' in kwargs:
        gem_cmd.append('--unique-pairing')

    # check kwargs
    for kw in kwargs:
        if not kw in ['nthreads', 'max_edit_distance', 'mismatches',
                      'max_reads_per_chunk', 'out_files', 'temp_dir', 'skip',
                      'q', 'm', 's', 'strata-after-best',
                      'allow-incomplete-strata', 'granularity',
                      'max-decoded-matches', 'min-decoded-strata',
                      'min-insert-size', 'max-insert-size',
                      'min-matched-bases', 'gem-quality-threshold',
                      'max-big-indel-length', 'mismatch-alphabet', 'E',
                      'max-extendable-matches', 'max-extensions-per-match',
                      'e', 'paired-end-alignment', 'p', 'map-both-ends',
                      'fast-mapping', 'unique-mapping', 'unique-pairing',
                      'suffix']:
            warn('WARNING: %s not in usual keywords, misspelled?' % kw)

    print ' '.join(gem_cmd)
    try:
        # check_call(gem_cmd, stdout=PIPE, stderr=PIPE)
        out, err = Popen(gem_cmd, stdout=PIPE, stderr=PIPE).communicate()
    except CalledProcessError as e:
        print out
        print err
        raise Exception(e.output)
def full_mapping(mapper_index_path, fastq_path, out_map_dir, mapper='gem',
                 r_enz=None, frag_map=True, min_seq_len=15, windows=None,
                 add_site=True, clean=False, get_nread=False,
                 mapper_binary=None, mapper_params=None, **kwargs):
    """
    Maps FASTQ reads to an indexed reference genome. Mapping can be done
    either without knowledge of the restriction enzyme used, as in
    experiments performed without one, like Micro-C (iterative mapping), or
    using the ligation sites created from the digested ends (fragment-based
    mapping).

    :param mapper_index_path: path to index file created from a reference
       genome using gem-index tool or bowtie2-build
    :param fastq_path: PATH to FASTQ file, either compressed or not.
    :param out_map_dir: path to a directory where to store mapped reads in
       MAP format.
    :param None r_enz: name of the restriction enzyme used in the experiment,
       e.g. HindIII. This is optional if the frag_map option is False
    :param True frag_map: two-step mapping; first the full-length reads are
       mapped, then the remaining unmapped reads are divided into
       restriction-enzyme fragments and each is mapped.
    :param True add_site: when splitting the sequence by ligation sites
       found, removes the ligation site and puts back the original RE site.
    :param 15 min_seq_len: minimum size of a fragment to map
    :param None windows: tuple of ranges for the beginning and end of the
       mapping. This parameter allows to do classical iterative mapping, e.g.
       windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50)). A unique window
       can also be passed, for trimming, like this: windows=((1,101),)
    :param False clean: remove intermediate files created in temp_dir
    :param 8 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations
       allowed while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that,
       however other options are chosen, all the matches up to the specified
       number of substitutions will be found by the program.
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will
       be written there.
    :param False get_nread: returns a list of lists where each element
       contains a path and the number of reads processed
    :param gem-mapper mapper_binary: path to the binary mapper
    :param None mapper_params: extra parameters for the mapper

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.map_parser.parse_map`
    """
    skip = kwargs.get('skip', False)
    suffix = kwargs.get('suffix', '')
    suffix = ('_' * (suffix != '')) + suffix
    nthreads = kwargs.get('nthreads', 8)
    outfiles = []
    temp_dir = os.path.abspath(
        os.path.expanduser(kwargs.get('temp_dir', gettempdir())))
    if mapper == 'gem':
        gem_version = None
        # check that we have the GEM binary:
        gem_binary = mapper_binary or 'gem-mapper'
        gem_binary = which(gem_binary)
        if not gem_binary:
            raise Exception('\n\nERROR: GEM binary not found, install it from:'
                            '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
                            '\n - Download the GEM-binaries-Linux-x86_64-core_i3 '
                            'if you have a recent computer, the '
                            'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
                            'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
                            'Copy the binary gem-mapper to /usr/local/bin/ for '
                            'example (somewhere in your PATH).\n\nNOTE: GEM '
                            'does not provide any binary for MAC-OS.')
        try:
            out, err = Popen([gem_binary, '--version'], stdout=PIPE,
                             stderr=STDOUT,
                             universal_newlines=True).communicate()
            gem_version = int(out[1])
        except ValueError as e:
            gem_version = 2
            print('Falling back to GEM v2')
    if mapper_params and isinstance(mapper_params, dict):
        kwargs.update(mapper_params)
    # create directories
    for rep in [temp_dir, out_map_dir]:
        mkdir(rep)
    # check space
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = '.'.join(base_name.split('.')[:-1])

    input_reads = fastq_path
    if windows is None:
        light_storage = True
        windows = (None, )
    elif isinstance(windows[0], int):
        # if windows starts at zero we do not need to store all the sequence
        # otherwise we need it because sequence can be trimmed two times
        # in fragment based mapping
        light_storage = True if not windows[0] else False
        windows = [tuple(windows)]
    else:
        # ensure that each element is a tuple, not a list
        windows = [tuple(win) for win in windows]
        # in this case we will need to keep the information about original
        # sequence at any point, light storage is thus not possible.
        light_storage = False

    for win in windows:
        # prepare the FASTQ file and iterate over it
        curr_map, counter = transform_fastq(
            input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            fastq=is_fastq(input_reads), min_seq_len=min_seq_len, trim=win,
            skip=skip, nthreads=nthreads, light_storage=light_storage)
        # clean
        if input_reads != fastq_path and clean:
            print(' x removing original input %s' % input_reads)
            os.system('rm -f %s' % (input_reads))
        # first mapping, full length
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        out_map_path = curr_map + '_full_%s-%s%s.map' % (beg, end, suffix)
        if end:
            print('Mapping reads in window %s-%s%s...' % (beg, end, suffix))
        else:
            print('Mapping full reads...', curr_map)
        if not skip:
            if mapper == 'gem':
                _gem_mapping(mapper_index_path, curr_map, out_map_path,
                             gem_binary=gem_binary, gem_version=gem_version,
                             gem_params=mapper_params, **kwargs)
                # parse map file to extract not uniquely mapped reads
                print('Parsing result...')
                if gem_version >= 3:
                    _sam_filter(out_map_path, curr_map,
                                curr_map + '_filt_%s-%s%s.map' % (beg, end,
                                                                  suffix),
                                os.path.join(out_map_dir,
                                             base_name + '_full_%s-%s%s.map' %
                                             (beg, end, suffix)))
                else:
                    _gem_filter(out_map_path,
                                curr_map + '_filt_%s-%s%s.map' % (beg, end,
                                                                  suffix),
                                os.path.join(out_map_dir,
                                             base_name + '_full_%s-%s%s.map' %
                                             (beg, end, suffix)))
            elif mapper == 'bowtie2' or mapper == 'hisat2':
                _bowtie2_mapping(mapper_index_path, curr_map, out_map_path,
                                 bowtie2_binary=(mapper_binary if
                                                 mapper_binary else mapper),
                                 bowtie2_params=mapper_params, **kwargs)
                # parse map file to extract not uniquely mapped reads
                print('Parsing result...')
                _sam_filter(out_map_path, curr_map,
                            curr_map + '_filt_%s-%s%s.map' % (beg, end,
                                                              suffix),
                            os.path.join(out_map_dir,
                                         base_name + '_full_%s-%s%s.map' %
                                         (beg, end, suffix)))
            else:
                raise Exception('ERROR: unknown mapper.')
        # clean
        if clean:
            print(' x removing %s input %s' % (mapper.upper(), curr_map))
            os.system('rm -f %s' % (curr_map))
            print(' x removing map %s' % out_map_path)
            os.system('rm -f %s' % (out_map_path))
        # for next round, we will use remaining unmapped reads
        input_reads = curr_map + '_filt_%s-%s%s.map' % (beg, end, suffix)
        outfiles.append((os.path.join(out_map_dir,
                                      base_name + '_full_%s-%s%s.map' %
                                      (beg, end, suffix)), counter))

    # map again splitting unmapped reads into RE fragments
    # (no need to trim this time)
    if frag_map:
        if not r_enz:
            raise Exception('ERROR: need enzyme name to fragment.')
        frag_map, counter = transform_fastq(
            input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            min_seq_len=min_seq_len, trim=win, fastq=False, r_enz=r_enz,
            add_site=add_site, skip=skip, nthreads=nthreads,
            light_storage=light_storage)
        # clean
        if clean:
            print(' x removing pre-%s input %s' % (mapper.upper(),
                                                   input_reads))
            os.system('rm -f %s' % (input_reads))
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        out_map_path = frag_map + '_frag_%s-%s%s.map' % (beg, end, suffix)
        if not skip:
            if mapper == 'gem':
                print('Mapping fragments of remaining reads...')
                _gem_mapping(mapper_index_path, frag_map, out_map_path,
                             gem_binary=gem_binary, gem_version=gem_version,
                             **kwargs)
                print('Parsing result...')
                # GEM v3 writes SAM output, older versions MAP output
                if gem_version >= 3:
                    _sam_filter(out_map_path, frag_map,
                                curr_map + '_fail%s.map' % (suffix),
                                os.path.join(out_map_dir,
                                             base_name + '_frag_%s-%s%s.map' %
                                             (beg, end, suffix)))
                else:
                    _gem_filter(out_map_path,
                                curr_map + '_fail%s.map' % (suffix),
                                os.path.join(out_map_dir,
                                             base_name + '_frag_%s-%s%s.map' %
                                             (beg, end, suffix)))
            elif mapper == 'bowtie2' or mapper == 'hisat2':
                print('Mapping fragments of remaining reads...')
                _bowtie2_mapping(mapper_index_path, frag_map, out_map_path,
                                 bowtie2_binary=(mapper_binary if
                                                 mapper_binary else mapper),
                                 bowtie2_params=mapper_params, **kwargs)
                print('Parsing result...')
                _sam_filter(out_map_path, frag_map,
                            curr_map + '_fail%s.map' % (suffix),
                            os.path.join(out_map_dir,
                                         base_name + '_frag_%s-%s%s.map' %
                                         (beg, end, suffix)))
            else:
                raise Exception('ERROR: unknown mapper.')
        # clean
        if clean:
            print(' x removing %s input %s' % (mapper.upper(), frag_map))
            os.system('rm -f %s' % (frag_map))
            print(' x removing failed to map ' + curr_map +
                  '_fail%s.map' % (suffix))
            os.system('rm -f %s' % (curr_map + '_fail%s.map' % (suffix)))
            print(' x removing tmp mapped %s' % out_map_path)
            os.system('rm -f %s' % (out_map_path))
        outfiles.append((os.path.join(out_map_dir,
                                      base_name + '_frag_%s-%s%s.map' %
                                      (beg, end, suffix)), counter))

    if clean:
        os.system('rm -rf %s' % (temp_dir))

    if get_nread:
        return outfiles
    return [out for out, _ in outfiles]
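# Usage sketch (illustrative, not part of the original source): classic
# iterative mapping over growing read windows, followed by fragment-based
# rescue of the still-unmapped reads. Index, FASTQ path and enzyme are
# hypothetical; the returned paths are meant for
# pytadbit.parsers.map_parser.parse_map.
def _example_full_mapping():
    return full_mapping('genome/hg38.gem', 'reads_1.fastq', 'maps/read1',
                        mapper='gem', r_enz='MboI', frag_map=True,
                        windows=((1, 25), (1, 50), (1, 75)),
                        nthreads=8, clean=True)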
def bed2D_to_BAMhic(infile, valid, ncpus, outbam, frmt, masked=None,
                    samtools='samtools'):
    """
    Function adapted from Enrique Vidal <*****@*****.**> script to convert
    2D beds into compressed BAM format.

    Gets the *_both_filled_map.tsv contacts from TADbit (and the
    corresponding filter files) and outputs a modified indexed BAM with the
    following fields:

       - read ID
       - filtering flag (see codes in header)
       - chromosome ID of the first pair of the contact
       - genomic position of the first pair of the contact
       - MAPQ set to 0
       - pseudo CIGAR with sequence length and info about current copy
         (P: first copy, S: second copy)
       - chromosome ID of the second pair of the contact
       - genomic position of the second pair of the contact
       - mapped length of the second pair of the contact
       - sequence is missing (*)
       - quality is missing (*)
       - TC tag indicating single (1) or multi contact (3 6 ... number being
         the number of times a given sequenced fragment is involved in a
         pairwise contact)
       - S1 and S2 tags are the strand orientation of the left and right
         read-end

    Each pair of contacts produces two lines in the output BAM
    """
    samtools = which(samtools)
    if not samtools:
        raise Exception('ERROR: samtools is needed to save a compressed '
                        'version of the results. Check '
                        'http://samtools.sourceforge.net/ \n')

    # define filter codes
    filter_keys = OrderedDict()
    for k in MASKED:
        filter_keys[MASKED[k]['name'].replace(' ', '-')] = 2**(k - 1)

    output = ''

    # write header
    output += ("\t".join(("@HD", "VN:1.5", "SO:queryname")) + '\n')
    fhandler = open(infile)
    line = fhandler.next()
    # chromosome lengths
    pos_fh = 0
    while line.startswith('#'):
        (_, _, cr, ln) = line.replace("\t", " ").strip().split(" ")
        output += ("\t".join(("@SQ", "SN:" + cr, "LN:" + ln)) + '\n')
        pos_fh += len(line)
        line = fhandler.next()

    # filter codes
    for i in filter_keys:
        output += ("\t".join(("@CO", "filter:" + i,
                              "flag:" + str(filter_keys[i]))) + '\n')

    # tags
    output += ("\t".join(("@CO", "TC:i", "Number of times a sequenced "
                          "fragment is involved in a pairwise contact\n")))
    output += ("\t".join(("@CO", ("Each read is duplicated: once starting "
                                  "with the left read-end, once with the "
                                  "right read-end\n"))))
    output += ("\t".join(("@CO", (" the order of RE sites and strands changes "
                                  "consequently depending on which read-end "
                                  "comes first ("
                                  "when right end is first: E3 E4 E1 E2)\n"))))
    output += ("\t".join(("@CO", (" CIGAR code contains the length of the "
                                  "1st read-end mapped and 'P' or 'S' "
                                  "if the copy is the first or the second\n"))))
    output += ("\t".join(("@CO", "E1:i",
                          "Position of the left RE site of 1st read-end\n")))
    output += ("\t".join(("@CO", "E2:i",
                          "Position of the right RE site of 1st read-end\n")))
    output += ("\t".join(("@CO", "E3:i",
                          "Position of the left RE site of 2nd read-end\n")))
    output += ("\t".join(("@CO", "E4:i",
                          "Position of the right RE site of 2nd read-end\n")))
    output += ("\t".join(("@CO", "S1:i",
                          "Strand of the 1st read-end "
                          "(1: positive, 0: negative)\n")))
    output += ("\t".join(("@CO", "S2:i",
                          "Strand of the 2nd read-end "
                          "(1: positive, 0: negative)\n")))

    # open and init filter files
    if not valid:
        filter_line, filter_handler = get_filters(infile, masked)
    fhandler.seek(pos_fh)
    # check samtools version number and modify command line accordingly:
    # in versions >= 1.3, sort takes -o and no longer appends '.bam'
    version = LooseVersion([l.split()[1]
                            for l in Popen(samtools,
                                           stderr=PIPE).communicate()[1].split('\n')
                            if 'Version' in l][0])
    pre = '-o' if version >= LooseVersion('1.3') else ''
    proc = Popen(samtools + ' view -Shb -@ %d - | samtools sort -@ %d - %s %s' % (
        ncpus, ncpus, pre,
        outbam + '.bam' if version >= LooseVersion('1.3') else ''),
                 shell=True, stdin=PIPE)
    proc.stdin.write(output)
    if frmt == 'mid':
        map2sam = _map2sam_mid
    elif frmt == 'long':
        map2sam = _map2sam_long
    else:
        map2sam = _map2sam_short

    if valid:
        for line in fhandler:
            flag = 0
            # get output in SAM format
            proc.stdin.write(map2sam(line, flag))
    else:
        for line in fhandler:
            flag = 0
            # check if read matches any filter
            rid = line.split("\t")[0]
            for i in filter_line:
                if filter_line[i] == rid:
                    flag += filter_keys[i]
                    try:
                        filter_line[i] = filter_handler[i].next().strip()
                    except StopIteration:
                        pass
            # get output in SAM format
            proc.stdin.write(map2sam(line, flag))
    proc.stdin.close()
    proc.wait()

    # index the resulting BAM
    _ = Popen(samtools + ' index %s.bam' % (outbam),
              shell=True).communicate()

    # close file handlers
    fhandler.close()
    if not valid:
        for i in filter_handler:
            filter_handler[i].close()
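# Illustrative helper (not part of the original source): decode a filtering
# flag from the second column of the generated BAM back into filter names,
# inverting the name -> 2**(k - 1) encoding written in the header above.
# Assumes the same MASKED mapping used by bed2D_to_BAMhic.
def _decode_filter_flag(flag):
    return [MASKED[k]['name'].replace(' ', '-')
            for k in MASKED if flag & 2**(k - 1)]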
def generate_BAM(infile, valid, ncpus, outbam, frmt):
    # define filter codes
    filter_keys = OrderedDict()
    filter_keys['self-circle']        = 2 ** 0
    filter_keys['dangling-end']       = 2 ** 1
    filter_keys['error']              = 2 ** 2
    filter_keys['extra-dangling-end'] = 2 ** 3
    filter_keys['too-close-from-RES'] = 2 ** 4
    filter_keys['too-short']          = 2 ** 5
    filter_keys['too-large']          = 2 ** 6
    filter_keys['over-represented']   = 2 ** 7
    filter_keys['duplicated']         = 2 ** 8
    filter_keys['random-breaks']      = 2 ** 9
    filter_keys['trans-chromosomic']  = 2 ** 10

    output = ''

    # write header
    output += ("\t".join(("@HD", "VN:1.5", "SO:queryname")) + '\n')
    fhandler = open(infile)
    line = fhandler.next()
    # chromosome lengths
    pos_fh = 0
    while line.startswith('#'):
        (_, _, cr, ln) = line.replace("\t", " ").strip().split(" ")
        output += ("\t".join(("@SQ", "SN:" + cr, "LN:" + ln)) + '\n')
        pos_fh += len(line)
        line = fhandler.next()

    # filter codes
    for i in filter_keys:
        output += ("\t".join(("@CO", "filter:" + i,
                              "flag:" + str(filter_keys[i]))) + '\n')

    # tags
    output += ("\t".join(("@CO", "TC:i", "Number of times a sequenced "
                          "fragment is involved in a pairwise contact\n")))
    output += ("\t".join(("@CO", ("Each read is duplicated: once starting "
                                  "with the left read-end, once with the "
                                  "right read-end\n"))))
    output += ("\t".join(("@CO", (" the order of RE sites and strands changes "
                                  "consequently depending on which read-end "
                                  "comes first ("
                                  "when right end is first: E3 E4 E1 E2)\n"))))
    output += ("\t".join(("@CO", (" CIGAR code contains the length of the "
                                  "1st read-end mapped and 'P' or 'S' "
                                  "if the copy is the first or the second\n"))))
    output += ("\t".join(("@CO", "E1:i",
                          "Position of the left RE site of 1st read-end\n")))
    output += ("\t".join(("@CO", "E2:i",
                          "Position of the right RE site of 1st read-end\n")))
    output += ("\t".join(("@CO", "E3:i",
                          "Position of the left RE site of 2nd read-end\n")))
    output += ("\t".join(("@CO", "E4:i",
                          "Position of the right RE site of 2nd read-end\n")))
    output += ("\t".join(("@CO", "S1:i",
                          "Strand of the 1st read-end "
                          "(1: positive, 0: negative)\n")))
    output += ("\t".join(("@CO", "S2:i",
                          "Strand of the 2nd read-end "
                          "(1: positive, 0: negative)\n")))

    # open and init filter files
    if not valid:
        filter_line, filter_handler = get_filters(infile)
    fhandler.seek(pos_fh)
    # check samtools version number and modify command line accordingly:
    # in versions >= 1.3, sort takes -o and no longer appends '.bam'
    samtools = which('samtools')
    version = LooseVersion([l.split()[1]
                            for l in Popen(samtools,
                                           stderr=PIPE).communicate()[1].split('\n')
                            if 'Version' in l][0])
    pre = '-o' if version >= LooseVersion('1.3') else ''
    proc = Popen('samtools view -Shb -@ %d - | samtools sort -@ %d - %s %s' % (
        ncpus, ncpus, pre,
        outbam + '.bam' if version >= LooseVersion('1.3') else ''),
                 shell=True, stdin=PIPE)
    proc.stdin.write(output)

    if frmt == 'mid':
        map2sam = _map2sam_mid
    elif frmt == 'long':
        map2sam = _map2sam_long
    else:
        map2sam = _map2sam_short

    if valid:
        for line in fhandler:
            flag = 0
            # get output in SAM format
            proc.stdin.write(map2sam(line, flag))
    else:
        for line in fhandler:
            flag = 0
            # check if read matches any filter
            rid = line.split("\t")[0]
            for i in filter_line:
                if filter_line[i] == rid:
                    flag += filter_keys[i]
                    try:
                        filter_line[i] = filter_handler[i].next().strip()
                    except StopIteration:
                        pass
            # get output in SAM format
            proc.stdin.write(map2sam(line, flag))
    proc.stdin.close()
    proc.wait()

    # index the resulting BAM
    _ = Popen('samtools index %s.bam' % (outbam), shell=True).communicate()

    # close file handlers
    fhandler.close()
    if not valid:
        for i in filter_handler:
            filter_handler[i].close()
def create_BAMhic(hic, ncpus, outbam, chromosomes, reso, masked=None,
                  samtools='samtools'):
    """
    Function adapted from Enrique Vidal's <*****@*****.**> script to convert
    2D beds into compressed BAM format.

    Gets the *_both_filled_map.tsv contacts from TADbit (and the
    corresponding filter files) and outputs a modified indexed BAM with the
    following fields:

       - read ID
       - filtering flag (see codes in header)
       - chromosome ID of the first pair of the contact
       - genomic position of the first pair of the contact
       - MAPQ set to 0
       - pseudo CIGAR with sequence length and info about current
         copy (P: first copy, S: second copy)
       - chromosome ID of the second pair of the contact
       - genomic position of the second pair of the contact
       - mapped length of the second pair of the contact
       - sequence is missing (*)
       - quality is missing (*)
       - TC tag indicating single (1) or multi contact (3 6 ... number being
         the number of times a given sequenced fragment is involved in a
         pairwise contact)
       - S1 and S2 tags are the strand orientation of the left and right
         read-end

    Each pair of contacts produces two lines in the output BAM
    """
    samtools = which(samtools)
    if not samtools:
        raise Exception('ERROR: samtools is needed to save a compressed '
                        'version of the results. Check '
                        'http://samtools.sourceforge.net/ \n')

    # define filter codes
    filter_keys = OrderedDict()
    for k in MASKED:
        filter_keys[MASKED[k]['name'].replace(' ', '-')] = 2**(k - 1)

    output = ''

    # write header
    output += ("\t".join(("@HD", "VN:1.5", "SO:queryname")) + '\n')
    # chromosome lengths
    for chrom in chromosomes:
        output += ("\t".join(("@SQ", "SN:" + chrom,
                              "LN:" + str(chromosomes[chrom]))) + '\n')

    # filter codes
    for i in filter_keys:
        output += ("\t".join(("@CO", "filter:" + i,
                              "flag:" + str(filter_keys[i]))) + '\n')

    # tags
    output += ("\t".join(("@CO", "TC:i",
                          "Number of times a sequenced fragment is involved "
                          "in a pairwise contact\n")))
    output += ("\t".join(("@CO",
                          ("Each read is duplicated: once starting with the "
                           "left read-end, once with the right read-end\n"))))
    output += ("\t".join(("@CO",
                          (" the order of RE sites and strands changes consequently "
                           "depending on which read-end comes first ("
                           "when right end is first: E3 E4 E1 E2)\n"))))
    output += ("\t".join(("@CO",
                          (" CIGAR code contains the length of the "
                           "1st read-end mapped and 'P' or 'S' "
                           "if the copy is the first or the second\n"))))
    output += ("\t".join(("@CO", "E1:i",
                          "Position of the left RE site of 1st read-end\n")))
    output += ("\t".join(("@CO", "E2:i",
                          "Position of the right RE site of 1st read-end\n")))
    output += ("\t".join(("@CO", "E3:i",
                          "Position of the left RE site of 2nd read-end\n")))
    output += ("\t".join(("@CO", "E4:i",
                          "Position of the right RE site of 2nd read-end\n")))
    output += ("\t".join(("@CO", "S1:i",
                          "Strand of the 1st read-end (1: positive, 0: negative)\n")))
    output += ("\t".join(("@CO", "S2:i",
                          "Strand of the 2nd read-end (1: positive, 0: negative)\n")))

    # check samtools version number and modify command line
    version = LooseVersion([l.split()[1]
                            for l in Popen(samtools, stderr=PIPE,
                                           universal_newlines=True).communicate()[1].split('\n')
                            if 'Version' in l][0])
    pre = '-o' if version >= LooseVersion('1.3') else ''
    # in versions >= 1.3 samtools sort needs '-o' and the full output name;
    # older versions take a prefix and append '.bam' themselves
    proc = Popen(samtools + ' view -Shb -@ %d - | samtools sort -@ %d - %s %s' % (
        ncpus, ncpus, pre,
        outbam + ('.bam' if version >= LooseVersion('1.3') else '')),
                 shell=True, stdin=PIPE, universal_newlines=True)
    proc.stdin.write(output)
    map2sam = _map2sam_mid

    # map matrix bin indices back to 1-based genomic coordinates
    rownam = [(k[0], k[1] * reso + 1)
              for k in sorted(hic.sections, key=lambda x: hic.sections[x])]
    total_counts = 0
    iter_rows = hic.yield_matrix()
    for nrow, row in enumerate(rownam):
        line = next(iter_rows)
        iter_cols = iter(line[nrow:])
        for ncol in range(nrow, len(rownam)):
            col = rownam[ncol]
            val = int(next(iter_cols))
            total_counts += val
            if not val:
                continue
            readid = '%s.%d.%s.%d' % (row[0], nrow, col[0], ncol)
            for nval in range(val):
                line_out = '%s.%d\t%s\t%d\t.\t1\t.\t.\t%s\t%d\t.\t1\t.\t.' % (
                    readid, nval, row[0], row[1], col[0], col[1])
                flag = 0
                proc.stdin.write(map2sam(line_out, flag))
    proc.stdin.close()
    proc.wait()

    # Index BAM
    _ = Popen(samtools + ' index %s.bam' % (outbam),
              shell=True, universal_newlines=True).communicate()
    return total_counts
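# Hedged example (illustration only, toy values): create_BAMhic converts
# matrix bin indices back to 1-based genomic coordinates through `rownam`,
# where bin k of a chromosome at resolution `reso` starts at k * reso + 1.
# `sections` here is a toy stand-in for `hic.sections` (bin -> matrix index).
reso = 100000
sections = {('chr1', 0): 0, ('chr1', 1): 1, ('chr2', 0): 2}
rownam = [(k[0], k[1] * reso + 1)
          for k in sorted(sections, key=lambda x: sections[x])]
assert rownam == [('chr1', 1), ('chr1', 100001), ('chr2', 1)]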
def run(opts):
    check_options(opts)
    samtools = which(opts.samtools)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bam1:
        mreads1 = path.realpath(opts.bam1)
        biases1 = opts.biases1
    else:
        biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)
        try:
            biases1 = path.join(opts.workdir1, biases1)
        except AttributeError:
            biases1 = None
        except TypeError:  # Py3
            biases1 = None

    if opts.bam2:
        mreads2 = path.realpath(opts.bam2)
        biases2 = opts.biases2
    else:
        biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)
        try:
            biases2 = path.join(opts.workdir2, biases2)
        except AttributeError:
            biases2 = None
        except TypeError:  # Py3
            biases2 = None

    filter_exclude = opts.filter

    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        printime('  - loading first sample %s' % (mreads1))
        hic_data1 = load_hic_data_from_bam(mreads1, opts.reso, biases=biases1,
                                           tmpdir=path.join(opts.workdir,
                                                            '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        printime('  - loading second sample %s' % (mreads2))
        hic_data2 = load_hic_data_from_bam(mreads2, opts.reso, biases=biases2,
                                           tmpdir=path.join(opts.workdir,
                                                            '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        if opts.workdir1 and opts.workdir2:
            masked1 = {'valid-pairs': {'count': 0}}
            masked2 = {'valid-pairs': {'count': 0}}
        else:
            masked1 = {'valid-pairs': {'count': sum(hic_data1.values())}}
            masked2 = {'valid-pairs': {'count': sum(hic_data2.values())}}

        decay_corr_dat = path.join(opts.workdir, '00_merge',
                                   'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        decay_corr_fig = path.join(opts.workdir, '00_merge',
                                   'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        eigen_corr_dat = path.join(opts.workdir, '00_merge',
                                   'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        eigen_corr_fig = path.join(opts.workdir, '00_merge',
                                   'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash))

        printime('  - comparing experiments')
        printime('    => correlation between equidistant loci')
        corr, _, scc, std, bads = correlate_matrices(
            hic_data1, hic_data2, normalized=opts.norm,
            remove_bad_columns=True, savefig=decay_corr_fig,
            savedata=decay_corr_dat, get_bads=True)
        print('      - correlation score (SCC): %.4f (+- %.7f)' % (scc, std))
        printime('    => correlation between eigenvectors')
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2,
                                          normalized=opts.norm,
                                          remove_bad_columns=True, nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)
        printime('    => reproducibility score')
        reprod = get_reproducibility(hic_data1, hic_data2, num_evec=20,
                                     normalized=opts.norm, verbose=False,
                                     remove_bad_columns=True)
        print('      - reproducibility score: %.4f' % (reprod))
        ncols = len(hic_data1)
    else:
        ncols = 0
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'
        masked1 = {}
        masked2 = {}
        corr = eig_corr = scc = std = reprod = 0
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s.bam' % (param_hash))

    if not opts.skip_merge:
        printime(' - Merging experiments')
        system(samtools + ' merge -@ %d %s %s %s' % (opts.cpus, outbam,
                                                     mreads1, mreads2))
        printime(' - Indexing new BAM file')
        # check samtools version number and modify command line
        version = LooseVersion([l.split()[1]
                                for l in Popen(samtools, stderr=PIPE,
                                               universal_newlines=True).communicate()[1].split('\n')
                                if 'Version' in l][0])
        if version >= LooseVersion('1.3.1'):
            system(samtools + ' index -@ %d %s' % (opts.cpus, outbam))
        else:
            system(samtools + ' index %s' % (outbam))
    else:
        outbam = ''

    finish_time = time.localtime()
    save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
               len(list(bads.keys())), ncols, scc, std, reprod,
               eigen_corr_dat, eigen_corr_fig, outbam, corr, eig_corr,
               biases1, biases2, masked1, masked2, launch_time, finish_time)
    printime('\nDone.')
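# Hedged example (illustration only; assumes, as the gate above does, that
# 'samtools index' only accepts the threading option '-@' from 1.3.1 on):
# the command line is chosen by comparing LooseVersion objects, which order
# version strings component-wise rather than lexicographically.
from distutils.version import LooseVersion as _LV
assert _LV('1.9') >= _LV('1.3.1')         # threaded indexing enabled
assert not (_LV('1.2') >= _LV('1.3.1'))   # falls back to plain 'samtools index'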
def bed2D_to_BAMhic(infile, valid, ncpus, outbam, frmt, masked=None,
                    samtools='samtools'):
    """
    Function adapted from Enrique Vidal's <*****@*****.**> script to convert
    2D beds into compressed BAM format.

    Gets the *_both_filled_map.tsv contacts from TADbit (and the
    corresponding filter files) and outputs a modified indexed BAM with the
    following fields:

       - read ID
       - filtering flag (see codes in header)
       - chromosome ID of the first pair of the contact
       - genomic position of the first pair of the contact
       - MAPQ set to 0
       - pseudo CIGAR with sequence length and info about current
         copy (P: first copy, S: second copy)
       - chromosome ID of the second pair of the contact
       - genomic position of the second pair of the contact
       - mapped length of the second pair of the contact
       - sequence is missing (*)
       - quality is missing (*)
       - TC tag indicating single (1) or multi contact (3 6 ... number being
         the number of times a given sequenced fragment is involved in a
         pairwise contact)
       - S1 and S2 tags are the strand orientation of the left and right
         read-end

    Each pair of contacts produces two lines in the output BAM
    """
    samtools = which(samtools)
    if not samtools:
        raise Exception('ERROR: samtools is needed to save a compressed '
                        'version of the results. Check '
                        'http://samtools.sourceforge.net/ \n')

    # define filter codes
    filter_keys = OrderedDict()
    for k in MASKED:
        filter_keys[MASKED[k]['name'].replace(' ', '-')] = 2**(k - 1)

    output = ''

    # write header
    output += ("\t".join(("@HD", "VN:1.5", "SO:queryname")) + '\n')
    fhandler = open(infile)
    line = next(fhandler)
    # chromosome lengths
    pos_fh = 0
    while line.startswith('#'):
        (_, _, cr, ln) = line.replace("\t", " ").strip().split(" ")
        output += ("\t".join(("@SQ", "SN:" + cr, "LN:" + ln)) + '\n')
        pos_fh += len(line)
        line = next(fhandler)

    # filter codes
    for i in filter_keys:
        output += ("\t".join(("@CO", "filter:" + i,
                              "flag:" + str(filter_keys[i]))) + '\n')

    # tags
    output += ("\t".join(("@CO", "TC:i",
                          "Number of times a sequenced fragment is involved "
                          "in a pairwise contact\n")))
    output += ("\t".join(("@CO",
                          ("Each read is duplicated: once starting with the "
                           "left read-end, once with the right read-end\n"))))
    output += ("\t".join(("@CO",
                          (" the order of RE sites and strands changes consequently "
                           "depending on which read-end comes first ("
                           "when right end is first: E3 E4 E1 E2)\n"))))
    output += ("\t".join(("@CO",
                          (" CIGAR code contains the length of the "
                           "1st read-end mapped and 'P' or 'S' "
                           "if the copy is the first or the second\n"))))
    output += ("\t".join(("@CO", "E1:i",
                          "Position of the left RE site of 1st read-end\n")))
    output += ("\t".join(("@CO", "E2:i",
                          "Position of the right RE site of 1st read-end\n")))
    output += ("\t".join(("@CO", "E3:i",
                          "Position of the left RE site of 2nd read-end\n")))
    output += ("\t".join(("@CO", "E4:i",
                          "Position of the right RE site of 2nd read-end\n")))
    output += ("\t".join(("@CO", "S1:i",
                          "Strand of the 1st read-end (1: positive, 0: negative)\n")))
    output += ("\t".join(("@CO", "S2:i",
                          "Strand of the 2nd read-end (1: positive, 0: negative)\n")))

    # open and init filter files
    if not valid:
        filter_line, filter_handler = get_filters(infile, masked)
    fhandler.seek(pos_fh)
    # check samtools version number and modify command line
    version = LooseVersion([l.split()[1]
                            for l in Popen(samtools, stderr=PIPE,
                                           universal_newlines=True).communicate()[1].split('\n')
                            if 'Version' in l][0])
    pre = '-o' if version >= LooseVersion('1.3') else ''
    # in versions >= 1.3 samtools sort needs '-o' and the full output name;
    # older versions take a prefix and append '.bam' themselves
    proc = Popen(samtools + ' view -Shb -@ %d - | samtools sort -@ %d - %s %s' % (
        ncpus, ncpus, pre,
        outbam + ('.bam' if version >= LooseVersion('1.3') else '')),
                 shell=True, stdin=PIPE, universal_newlines=True)
    proc.stdin.write(output)

    if frmt == 'mid':
        map2sam = _map2sam_mid
    elif frmt == 'long':
        map2sam = _map2sam_long
    else:
        map2sam = _map2sam_short

    if valid:
        for line in fhandler:
            flag = 0
            # get output in SAM format
            proc.stdin.write(map2sam(line, flag))
    else:
        for line in fhandler:
            flag = 0
            # check if read matches any filter
            rid = line.split("\t")[0]
            for i in filter_line:
                if filter_line[i] == rid:
                    flag += filter_keys[i]
                    try:
                        filter_line[i] = next(filter_handler[i]).strip()
                    except StopIteration:
                        pass
            # get output in SAM format
            proc.stdin.write(map2sam(line, flag))
    proc.stdin.close()
    proc.wait()

    # Index BAM
    _ = Popen(samtools + ' index %s.bam' % (outbam),
              shell=True).communicate()
    # close file handlers
    fhandler.close()
    if not valid:
        for i in filter_handler:
            filter_handler[i].close()
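# Hedged usage sketch (hypothetical file names, not from the source):
# converting a TADbit TSV of contacts plus its filter files into an indexed
# BAM. With valid=False the per-filter flags are encoded into each read's
# flag field; frmt='mid' selects the _map2sam_mid converter.
#
#     bed2D_to_BAMhic('sample_both_filled_map.tsv', valid=False, ncpus=8,
#                     outbam='sample_filtered', frmt='mid')
#
# This is expected to leave 'sample_filtered.bam' and its index on disk.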