def test_18_filter_reads(self):
    """
    Parse map/sam alignments, intersect read1/read2, filter the reads and
    check the per-filter read counts against known values.
    """
    if ONLY and ONLY != '18':
        return
    if CHKTIME:
        t0 = time()
    for ali in ['map', 'sam']:
        seed(1)
        # With seed(1) this checks whether the RNG produces the reference
        # sequence (presumably guards against RNG implementation
        # differences — TODO confirm); exact counts below only hold then.
        if 13436 == int(random()*100000):
            same_seed = True
            genome = generate_random_ali(ali)
            genome_bis = parse_fasta('test.fa~', verbose=False)
            self.assertEqual(genome, genome_bis)
        else:
            same_seed = False
            genome = parse_fasta('test.fa~')
        # PARSE SAM
        if ali == 'map':
            from pytadbit.parsers.map_parser import parse_map as parser
        else:
            try:
                from pytadbit.parsers.sam_parser import parse_sam as parser
            except ImportError:
                print 'ERROR: PYSAM not found, skipping test\n'
                continue
        parser(['test_read1.%s~' % (ali)], ['test_read2.%s~' % (ali)],
               './lala1-%s~' % (ali), './lala2-%s~' % (ali), genome,
               re_name='DPNII', mapper='GEM')
        # GET INTERSECTION
        from pytadbit.mapping import get_intersection
        get_intersection('lala1-%s~' % (ali), 'lala2-%s~' % (ali),
                         'lala-%s~' % (ali))
        # FILTER
        masked = filter_reads('lala-%s~' % (ali), verbose=False,
                              fast=(ali=='map'))
        # filters 1-4 always catch exactly 1000 reads each in the
        # generated dataset
        self.assertEqual(masked[1]['reads'], 1000)
        self.assertEqual(masked[2]['reads'], 1000)
        self.assertEqual(masked[3]['reads'], 1000)
        self.assertEqual(masked[4]['reads'], 1000)
        if same_seed:
            # exact counts only reproducible with the reference RNG
            self.assertEqual(masked[5]['reads'], 1110)
            self.assertEqual(masked[6]['reads'], 2332)
            self.assertEqual(masked[7]['reads'], 0)
            self.assertEqual(masked[8]['reads'], 141)
            self.assertEqual(masked[10]['reads'], 1)
        else:
            self.assertTrue(masked[5]['reads'] > 1000)
        self.assertEqual(masked[9]['reads'], 1000)
    # keep only reads caught by filter 1 (reverse=True) and count them
    apply_filter('lala-map~', 'lala-map-filt~', masked, filters=[1],
                 reverse=True, verbose=False)
    self.assertEqual(len([True for l in open('lala-map-filt~')
                          if not l.startswith('#')]), 1000)
    d = plot_iterative_mapping('lala1-map~', 'lala2-map~')
    self.assertEqual(d[0][1], 6000)
    if CHKTIME:
        self.assertEqual(True, True)
        print '18', time() - t0
def add_sections_from_fasta(self, fasta):
    """
    Add genomic coordinate to HiC_data object by getting them from a fasta
    file containing chromosome sequences

    :param fasta: path to a fasta file
    """
    genome = parse_fasta(fasta, verbose=False)
    sections = []
    genome_seq = OrderedDict()
    size = 0
    # number of bins per chromosome at the current resolution
    for crm in genome:
        genome_seq[crm] = int(len(genome[crm])) / self.resolution + 1
        size += genome_seq[crm]
    section_sizes = {}
    for crm in genome_seq:
        len_crm = genome_seq[crm]
        section_sizes[(crm,)] = len_crm
        # one (chromosome, bin index) tuple per bin
        sections.extend([(crm, i) for i in xrange(len_crm)])
    # map each (chromosome, bin) to its global bin index
    dict_sec = dict([(j, i) for i, j in enumerate(sections)])
    self.chromosomes = genome_seq
    self.sections = dict_sec
    if self.chromosomes:
        # cumulative (start, end) global bin positions per chromosome
        total = 0
        for crm in self.chromosomes:
            self.section_pos[crm] = (total, total + self.chromosomes[crm])
            total += self.chromosomes[crm]
    if size != self.__size:
        warn('WARNING: different sizes (%d, now:%d), ' % (self.__size, size) +
             'should adjust the resolution')
    self.__size = size
    self._size2 = size**2
def main():
    """Read command-line options, run iterative mapping, then load the genome."""
    opts = get_options()
    iterative_mapping()
    ## PARSE FASTA
    # When several FASTA paths are given, only the first one is used.
    if len(opts.fasta) <= 1:
        fasta_input = opts.fasta
    else:
        fasta_input = opts.fasta[0]
    genome = parse_fasta(fasta_input, chr_names=opts.chr_name, verbose=True)
def test_17_map_re_sites(self):
    """
    test fasta parsing and mapping re sites
    """
    ref_genome = parse_fasta(PATH + '/ref_genome/chr2L_chr4_dm3.bz2',
                             verbose=False)
    self.assertEqual(len(ref_genome['chr4']), 1351857)
    # each entry: (enzyme, expected count in chr2L bin 230,
    #              probed index on chr4 bin 10, expected position)
    expectations = [('dpnIi', 16, 50, 1018069),
                    ('hindiii', 3, 5, 1017223)]
    for enzyme, count230, index, position in expectations:
        frags = map_re_sites(enzyme, ref_genome)
        self.assertEqual(len(frags['chr2L']), 231)
        self.assertEqual(len(frags['chr2L'][230]), count230)
        self.assertEqual(frags['chr4'][10][index], position)
def main():
    """Map both ends of a Hi-C FASTQ, then parse, intersect and filter reads."""
    fastq = '/scratch/db/FASTQs/hsap/dixon_2012/dixon-2012_200bp.fastq'
    # NOTE(review): the assignment below overrides the path above —
    # presumably a debugging shortcut left in place.
    fastq = 'short_dixon-2012_200bp.fastq'
    # fastq = '/scratch/test/sample_dataset/FASTQs/sample_hsap_HindIII.fastq'
    gem_index_path = '/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.gem'
    out_map_dir1 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read1/'
    out_map_dir2 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read2/'
    temp_dir1 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp1/'
    temp_dir2 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp2/'
    # read end 1 spans positions 1-100, read end 2 spans 101-200
    print 'read 1'
    outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
                             temp_dir=temp_dir1, windows=((1,100),),
                             add_site=True)
    print 'read 2'
    outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
                             temp_dir=temp_dir2, windows=((101, 200),),
                             add_site=True)
    # print 'read 1'
    # outfiles1 = mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
    #                     temp_dir=temp_dir1,
    #                     windows=(zip(*([0] * len(range(25, 105, 5)),
    #                                    range(25,105,5)))))
    # print 'read 2'
    # outfiles2 = mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
    #                     temp_dir=temp_dir2,
    #                     windows=(zip(*([100] * len(range(125, 205, 5)),
    #                                    range(125,205,5)))))
    print outfiles1
    print 'xcmvnkljnv'  # NOTE(review): leftover debug marker
    print outfiles2
    from pytadbit.parsers.map_parser import parse_map
    from pytadbit.parsers.genome_parser import parse_fasta
    from pytadbit.mapping.mapper import get_intersection
    from pytadbit.mapping.filter import filter_reads, apply_filter
    read1, read2 = 'read1.tsv', 'read2.tsv',
    parse_map(outfiles1, outfiles2, out_file1=read1, out_file2=read2,
              genome_seq=parse_fasta('/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.fa'),
              re_name='HindIII', verbose=True)
    # merge both read ends, keep pairs, then mask and filter
    reads = 'both_reads.tsv'
    get_intersection(read1, read2, reads)
    masked = filter_reads(reads)
    freads = 'filtered_reads.tsv'
    apply_filter(reads, freads, masked)
def test_17_map_re_sites(self): """ test fasta parsing and mapping re sites """ if ONLY and ONLY != "17": return if CHKTIME: t0 = time() ref_genome = parse_fasta(PATH + "/ref_genome/chr2L_chr4_dm3.bz2", verbose=False) self.assertEqual(len(ref_genome["chr4"]), 1351857) frags = map_re_sites("dpnIi", ref_genome) self.assertEqual(len(frags["chr2L"]), 231) self.assertEqual(len(frags["chr2L"][230]), 16) self.assertEqual(frags["chr4"][10][50], 1018069) frags = map_re_sites("hindiii", ref_genome) self.assertEqual(len(frags["chr2L"]), 231) self.assertEqual(len(frags["chr2L"][230]), 3) self.assertEqual(frags["chr4"][10][5], 1017223) if CHKTIME: self.assertEqual(True, True) print "17", time() - t0
def test_17_map_re_sites(self): """ test fasta parsing and mapping re sites """ if ONLY and ONLY != '17': return if CHKTIME: t0 = time() ref_genome = parse_fasta(PATH + '/ref_genome/chr2L_chr4_dm3.bz2', verbose=False) self.assertEqual(len(ref_genome['chr4']), 1351857) frags = map_re_sites('dpnIi', ref_genome) self.assertEqual(len(frags['chr2L']), 231) self.assertEqual(len(frags['chr2L'][230]), 16) self.assertEqual(frags['chr4'][10][50], 1018069) frags = map_re_sites('hindiii', ref_genome) self.assertEqual(len(frags['chr2L']), 231) self.assertEqual(len(frags['chr2L'][230]), 3) self.assertEqual(frags['chr4'][10][5], 1017223) if CHKTIME: self.assertEqual(True, True) print '17', time() - t0
def run(opts):
    """
    Parse mapped read files into TSV, log the parsed counts and save job
    information to the database.

    :param opts: parsed command-line options (workdir, read, genome, skip,
       jobids, compress_input, ...)
    """
    check_options(opts)
    launch_time = time.localtime()
    # which read ends to process: [1], [2] or both
    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]
    f_names1, f_names2, renz = load_parameters_fromdb(opts.workdir, reads,
                                                      opts.jobids)
    name = path.split(opts.workdir)[-1]
    param_hash = digest_parameters(opts)
    outdir = '02_parsed_reads'
    mkdir(path.join(opts.workdir, outdir))
    if not opts.read:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))
    elif opts.read == 1:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = None
        f_names2 = None
    elif opts.read == 2:
        # only read 2: its input files arrive in f_names2
        out_file2 = None
        f_names1 = f_names2
        f_names2 = None
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))
    logging.info('parsing genomic sequence')
    try:
        # allows the use of cPickle genome to make it faster
        genome = load(open(opts.genome[0]))
    except UnpicklingError:
        genome = parse_fasta(opts.genome)
    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        counts, multis = parse_map(f_names1, f_names2, out_file1=out_file1,
                                   out_file2=out_file2, re_name=renz,
                                   verbose=True, genome_seq=genome,
                                   compress=opts.compress_input)
    else:
        # parsing skipped: recover counts from the headers of existing
        # output files instead
        counts = {}
        counts[0] = {}
        fhandler = open(out_file1)
        for line in fhandler:
            if line.startswith('# MAPPED '):
                _, _, item, value = line.split()
                counts[0][item] = int(value)
            elif not line.startswith('#'):
                break
        # count multi-contact markers ('|||') in the remaining lines
        multis = {}
        multis[0] = 0
        for line in fhandler:
            if '|||' in line:
                multis[0] += line.count('|||')
        if out_file2:
            counts[1] = {}
            fhandler = open(out_file2)
            for line in fhandler:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[1][item] = int(value)
                elif not line.startswith('#'):
                    break
            multis[1] = 0
            for line in fhandler:
                if '|||' in line:
                    multis[1] += line.count('|||')
    # write machine log, serialized across processes via an exclusive lock
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        fcntl.flock(mlog, fcntl.LOCK_EX)
        for read in counts:
            for item in counts[read]:
                mlog.write('# PARSED READ%s PATH\t%d\t%s\n' % (
                    read, counts[read][item],
                    out_file1 if read == 1 else out_file2))
        fcntl.flock(mlog, fcntl.LOCK_UN)
    finish_time = time.localtime()
    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2,
               launch_time, finish_time)
def test_18_filter_reads(self):
    """
    Parse map/sam alignments, intersect read1/read2, filter the reads and
    check the per-filter read counts against known values.
    """
    if ONLY and ONLY != "18":
        return
    if CHKTIME:
        t0 = time()
    for ali in ["map", "sam"]:
        seed(1)
        # With seed(1) this checks whether the RNG produces the reference
        # sequence (presumably guards against RNG implementation
        # differences — TODO confirm); exact counts below only hold then.
        if 13436 == int(random() * 100000):
            same_seed = True
            genome = generate_random_ali(ali)
            genome_bis = parse_fasta("test.fa~", verbose=False)
            self.assertEqual(genome, genome_bis)
        else:
            same_seed = False
            genome = parse_fasta("test.fa~")
        # PARSE SAM
        if ali == "map":
            from pytadbit.parsers.map_parser import parse_map as parser
        else:
            try:
                from pytadbit.parsers.sam_parser import parse_sam as parser
            except ImportError:
                print "ERROR: PYSAM not found, skipping test\n"
                continue
        parser(
            ["test_read1.%s~" % (ali)],
            ["test_read2.%s~" % (ali)],
            "./lala1-%s~" % (ali),
            "./lala2-%s~" % (ali),
            genome,
            re_name="DPNII",
            mapper="GEM",
        )
        # GET INTERSECTION
        from pytadbit.mapping import get_intersection

        get_intersection("lala1-%s~" % (ali), "lala2-%s~" % (ali),
                         "lala-%s~" % (ali))
        # FILTER
        masked = filter_reads("lala-%s~" % (ali), verbose=False,
                              fast=(ali == "map"))
        # filters 1-4 always catch exactly 1000 reads each in the
        # generated dataset
        self.assertEqual(masked[1]["reads"], 1000)
        self.assertEqual(masked[2]["reads"], 1000)
        self.assertEqual(masked[3]["reads"], 1000)
        self.assertEqual(masked[4]["reads"], 1000)
        if same_seed:
            # exact counts only reproducible with the reference RNG
            self.assertEqual(masked[5]["reads"], 1110)
            self.assertEqual(masked[6]["reads"], 2332)
            self.assertEqual(masked[7]["reads"], 0)
            self.assertEqual(masked[8]["reads"], 141)
            self.assertEqual(masked[10]["reads"], 1)
        else:
            self.assertTrue(masked[5]["reads"] > 1000)
        self.assertEqual(masked[9]["reads"], 1000)
    # keep only reads caught by filter 1 (reverse=True) and count them
    apply_filter("lala-map~", "lala-map-filt~", masked, filters=[1],
                 reverse=True, verbose=False)
    self.assertEqual(len([True for l in open("lala-map-filt~")
                          if not l.startswith("#")]), 1000)
    d = plot_iterative_mapping("lala1-map~", "lala2-map~")
    self.assertEqual(d[0][1], 6000)
    if CHKTIME:
        self.assertEqual(True, True)
        print "18", time() - t0
def run(opts):
    """
    Segment a Hi-C map: call A/B compartments and/or TADs per chromosome at
    the requested resolution, write result files and save them to the DB.

    :param opts: parsed command-line options (workdir, reso, crms, fasta,
       only_tads, only_compartments, nosql, ...)
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, get_md5=True)
    if opts.nosql:
        biases = opts.biases
        mreads = opts.mreads
        inputs = []
    elif opts.biases or opts.mreads:
        # both must be given together
        if not opts.mreads:
            raise Exception('ERROR: also need to provide BAM file')
        if not opts.biases:
            raise Exception('ERROR: also need to provide biases file')
        biases = opts.biases
        mreads = opts.mreads
        inputs = ['NA', 'NA']
        mkdir(path.join(opts.workdir))
    else:
        biases, mreads, biases_id, mreads_id = load_parameters_fromdb(opts)
        inputs = [biases_id, mreads_id]  # store path ids to be saved in database
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases)
    reso = opts.reso
    mkdir(path.join(opts.workdir, '06_segmentation'))
    print 'loading %s \n at resolution %s' % (mreads, nice(reso))
    region = None
    if opts.crms and len(opts.crms) == 1:
        region = opts.crms[0]
    hic_data = load_hic_data_from_bam(mreads, reso, ncpus=opts.cpus,
                                      region=region,
                                      biases=None if opts.all_bins else biases,
                                      filter_exclude=opts.filter)
    # compartments
    cmp_result = {}
    richA_stats = {}
    firsts = {}
    if not opts.only_tads:
        print 'Searching compartments'
        cmprt_dir = path.join(opts.workdir, '06_segmentation',
                              'compartments_%s' % (nice(reso)))
        mkdir(cmprt_dir)
        if opts.fasta:
            # GC content is used to orient eigenvectors (A = GC rich)
            print ' - Computing GC content to label compartments'
            rich_in_A = get_gc_content(parse_fasta(opts.fasta,
                                                   chr_filter=opts.crms),
                                       reso, chromosomes=opts.crms,
                                       by_chrom=True, n_cpus=opts.cpus)
        elif opts.rich_in_A:
            rich_in_A = opts.rich_in_A
        else:
            rich_in_A = None
        n_evs = opts.n_evs if opts.n_evs > 0 else 3
        firsts, richA_stats = hic_data.find_compartments(
            crms=opts.crms, savefig=cmprt_dir, verbose=True,
            suffix=param_hash, rich_in_A=rich_in_A,
            show_compartment_labels=rich_in_A is not None,
            savecorr=cmprt_dir if opts.savecorr else None,
            max_ev=n_evs, ev_index=opts.ev_index,
            vmin=None if opts.fix_corr_scale else 'auto',
            vmax=None if opts.fix_corr_scale else 'auto')
        # write one eigenvector TSV per chromosome
        for ncrm, crm in enumerate(opts.crms or hic_data.chromosomes):
            if not crm in firsts:
                continue
            ev_file = open(path.join(
                cmprt_dir, '%s_EigVect%d_%s.tsv' % (
                    crm, opts.ev_index[ncrm] if opts.ev_index else 1,
                    param_hash)), 'w')
            ev_file.write('# %s\n' % ('\t'.join(
                'EV_%d (%.4f)' % (i, v)
                for i, v in enumerate(firsts[crm][0], 1))))
            ev_file.write('\n'.join(['\t'.join([str(v) for v in vs])
                                     for vs in zip(*firsts[crm][1])]))
            ev_file.close()
        # write compartment calls and record output paths per chromosome
        for ncrm, crm in enumerate(opts.crms or hic_data.chromosomes):
            cmprt_file1 = path.join(cmprt_dir, '%s_%s.tsv' % (crm, param_hash))
            cmprt_file2 = path.join(cmprt_dir, '%s_EigVect%d_%s.tsv' % (
                crm, opts.ev_index[ncrm] if opts.ev_index else 1, param_hash))
            cmprt_image = path.join(cmprt_dir, '%s_EV%d_%s.%s' % (
                crm, opts.ev_index[ncrm] if opts.ev_index else 1,
                param_hash, opts.format))
            if opts.savecorr:
                cormat_file = path.join(cmprt_dir, '%s_corr-matrix%s.tsv' %
                                        (crm, param_hash))
            else:
                cormat_file = None
            hic_data.write_compartments(cmprt_file1, chroms=[crm])
            cmp_result[crm] = {'path_cmprt1': cmprt_file1,
                               'path_cmprt2': cmprt_file2,
                               'path_cormat': cormat_file,
                               'image_cmprt': cmprt_image,
                               'num' : len(hic_data.compartments[crm])}
    # TADs
    tad_result = {}
    if not opts.only_compartments:
        print 'Searching TADs'
        tad_dir = path.join(opts.workdir, '06_segmentation',
                            'tads_%s' % (nice(reso)))
        mkdir(tad_dir)
        for crm in hic_data.chromosomes:
            if opts.crms and not crm in opts.crms:
                continue
            print ' - %s' % crm
            matrix = hic_data.get_matrix(focus=crm)
            beg, end = hic_data.section_pos[crm]
            size = len(matrix)
            if size < 10:
                print " Chromosome too short (%d bins), skipping..." % size
                continue
            # transform bad column in chromosome referential
            if hic_data.bads:
                to_rm = tuple([1 if i in hic_data.bads else 0
                               for i in xrange(beg, end)])
            else:
                to_rm = None
            # maximum size of a TAD
            max_tad_size = ((size - 1) if opts.max_tad_size is None
                            else opts.max_tad_size)
            result = tadbit([matrix], remove=to_rm, n_cpus=opts.cpus,
                            verbose=opts.verbose, max_tad_size=max_tad_size,
                            no_heuristic=False)
            # use normalization to compute height on TADs called
            if opts.all_bins:
                if opts.nosql:
                    biases = load(open(biases))
                else:
                    biases = load(open(path.join(opts.workdir, biases)))
                hic_data.bads = biases['badcol']
                hic_data.bias = biases['biases']
            tads = load_tad_height(result, size, beg, end, hic_data)
            table = ''
            table += '%s\t%s\t%s\t%s\t%s\n' % ('#', 'start', 'end',
                                               'score', 'density')
            for tad in tads:
                table += '%s\t%s\t%s\t%s%s\n' % (
                    tad, int(tads[tad]['start'] + 1),
                    int(tads[tad]['end'] + 1),
                    abs(tads[tad]['score']),
                    '\t%s' % (round(float(tads[tad]['height']), 3)))
            out_tad = path.join(tad_dir, '%s_%s.tsv' % (crm, param_hash))
            out = open(out_tad, 'w')
            out.write(table)
            out.close()
            tad_result[crm] = {'path' : out_tad, 'num': len(tads)}
    finish_time = time.localtime()
    if not opts.nosql:
        try:
            save_to_db(opts, cmp_result, tad_result, reso, inputs,
                       richA_stats, firsts, param_hash,
                       launch_time, finish_time)
        except:
            # release lock anyway
            print_exc()
            try:
                remove(path.join(opts.workdir, '__lock_db'))
            except OSError:
                pass
            exit(1)
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq, genome_fasta, genome_index, \ output_directory, output_prefix, enzyme, res, chromosomes, threads_number, \ clean_tmp, tmp_dir): print 'Begin to process reads.' left_reads = '' right_reads = '' if reads_fastq != '': # left and right reads are stored in one file range_start_left, range_stop_left, \ range_start_right, range_stop_right = calc_left_right_ranges(reads_fastq) print 'Reads: ', reads_fastq left_reads = reads_fastq right_reads = reads_fastq else: # left and right reads are stored separately range_start_left, range_stop_left, \ range_start_right, range_stop_right = calc_range(left_reads_fastq) print 'Left reads: ', left_reads_fastq print 'Right reads: ', right_reads_fastq print 'Output prefix: ', output_prefix left_reads = left_reads_fastq right_reads = right_reads_fastq print 'Reference genome FASTA: ', genome_fasta print 'Reference genome GEM index:', genome_index print 'Output directory: ', output_directory print 'Temp directory: ', tmp_dir print 'Enzyme: ', enzyme print 'Resolution: ', res, 'bp' print 'Number of threads: ', threads_number print 'Start pos for left reads: ', range_start_left print 'Stop pos for left reads: ', range_stop_left print 'Start pos for right reads: ', range_start_right print 'Stop pos for right reads: ', range_stop_right stdout.flush() # map left reads to reference genome out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam' out_sam_left_path = join(output_directory, out_sam_left_name) print 'Iterative mapping of left reads (using ' + str(threads_number) + ' threads)...' stdout.flush() sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path, \ range_start_left, range_stop_left, nthreads=threads_number, temp_dir=tmp_dir) print 'Done.' 
stdout.flush() # map right reads to reference genome out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam' out_sam_right_path = join(output_directory, out_sam_right_name) print 'Iterative mapping of right reads (using ' + str(threads_number) + ' threads)...' stdout.flush() sams_right = iterative_mapping(genome_index, right_reads, out_sam_right_path, \ range_start_right, range_stop_right, nthreads=threads_number, temp_dir=tmp_dir) print 'Done.' stdout.flush() # load reference genome sequence print 'Load reference genome sequence...' stdout.flush() chroms = chromosomes[:] genome_seq = parse_fasta(genome_fasta, chr_names=chroms) print 'Done.' stdout.flush() # create files with information about every left and right read # and about their placement with respect to restriction sites tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv' tsv_left = join(output_directory, tsv_left_name) tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv' tsv_right = join(output_directory, tsv_right_name) print 'Get information about restriction sites and reads placement...' stdout.flush() parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme, \ verbose=True, ncpus=8) print 'Done.' stdout.flush() # create file with both left and right reads that uniquelly mapped to reference genome if reads_fastq != '': # left and right reads are stored in one file common_reads_prefix = splitext(basename(reads_fastq))[0] else: # left and right reads are stored separately common_reads_prefix = output_prefix uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv' uniq_reads = join(output_directory, uniq_reads_name) print 'Merge info about left and right reads in one file...' stdout.flush() get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True) print 'Done.' stdout.flush() # find read IDs that are filtered by default TADbit filters print 'Mask reads...' 
stdout.flush() # debug print "uniq_reads =", uniq_reads masked = filter_reads(uniq_reads) print 'Done.' stdout.flush() # apply all filters (exclude reads that were filtered) print 'Filter masked reads...' stdout.flush() filtered_reads_name = common_reads_prefix + '_filtered.tsv' filtered_reads = join(output_directory, filtered_reads_name) apply_filter(uniq_reads, filtered_reads, masked) print 'Done.' stdout.flush() # create matrices (one matrix per chromosome) print 'Create Hi-C maps (one per chromosome)...' stdout.flush() hic_map(filtered_reads, resolution=res, by_chrom='intra', savedata=output_directory) print 'Done.' stdout.flush() print 'Add resolution (' + str(resolution) + ') to matrix filenames...' stdout.flush() add_resolution(chromosomes, resolution, output_directory) print 'Done.' stdout.flush() print 'Add headers to matrix files...' stdout.flush() add_headers(chromosomes, resolution, output_directory) print 'Done.' stdout.flush() if clean_tmp: # Remove all SAM and TSV files from the output directory print 'Remove SAM and TSV files from the output directory.' stdout.flush() map(os.remove, glob.glob(out_sam_left_path + '*')) map(os.remove, glob.glob(out_sam_right_path + '*')) map(os.remove, glob.glob(join(output_directory, '*.tsv'))) print 'Done.' stdout.flush()
def run(opts):
    """
    Parse mapped read files into TSV, log the parsed counts (with a file
    lock) and save job information to the database.

    :param opts: parsed command-line options (workdir, read, genome, skip,
       jobids, filter_chrom, compress_input, ...)
    """
    check_options(opts)
    launch_time = time.localtime()
    # which read ends to process: [1], [2] or both
    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]
    f_names1, f_names2, renz = load_parameters_fromdb(opts, reads, opts.jobids)
    renz = renz.split('-')
    opts.workdir = path.abspath(opts.workdir)
    name = path.split(opts.workdir)[-1]
    param_hash = digest_parameters(opts)
    outdir = '02_parsed_reads'
    mkdir(path.join(opts.workdir, outdir))
    if not opts.read:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))
    elif opts.read == 1:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = None
        f_names2 = None
    elif opts.read == 2:
        # only read 2: its input files arrive in f_names2
        out_file2 = None
        f_names1 = f_names2
        f_names2 = None
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))
    logging.info('parsing genomic sequence')
    try:
        # allows the use of cPickle genome to make it faster
        genome = load(open(opts.genome[0]))
    except UnpicklingError:
        genome = parse_fasta(opts.genome, chr_regexp=opts.filter_chrom)
    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        counts, multis = parse_map(f_names1, f_names2, out_file1=out_file1,
                                   out_file2=out_file2, re_name=renz,
                                   verbose=True, genome_seq=genome,
                                   compress=opts.compress_input)
    else:
        # parsing skipped: recover counts from the headers of existing
        # output files instead
        counts = {}
        counts[0] = {}
        fhandler = open(out_file1)
        for line in fhandler:
            if line.startswith('# MAPPED '):
                _, _, item, value = line.split()
                counts[0][item] = int(value)
            elif not line.startswith('#'):
                break
        # histogram of multi-contact markers: {n of '|||' per line: count}
        multis = {}
        multis[0] = {}
        for line in fhandler:
            if '|||' in line:
                try:
                    multis[0][line.count('|||')] += 1
                except KeyError:
                    multis[0][line.count('|||')] = 1
        if out_file2:
            counts[1] = {}
            fhandler = open(out_file2)
            for line in fhandler:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[1][item] = int(value)
                elif not line.startswith('#'):
                    break
            # NOTE(review): multis[1] is a plain integer counter while
            # multis[0] above is a histogram dict — looks inconsistent;
            # verify what save_to_db expects for each read end.
            multis[1] = 0
            for line in fhandler:
                if '|||' in line:
                    multis[1] += line.count('|||')
    # write machine log, serialized across processes via a lock file
    while path.exists(path.join(opts.workdir, '__lock_log')):
        time.sleep(0.5)
    open(path.join(opts.workdir, '__lock_log'), 'a').close()
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        for read in counts:
            for item in counts[read]:
                mlog.write('# PARSED READ%s PATH\t%d\t%s\n' % (
                    read, counts[read][item],
                    out_file1 if read == 1 else out_file2))
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_log'))
    except OSError:
        pass
    finish_time = time.localtime()
    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2,
               launch_time, finish_time)
def run(opts): check_options(opts) launch_time = time.localtime() param_hash = digest_parameters(opts) if opts.bam: mreads = path.realpath(opts.bam) else: mreads = path.join(opts.workdir, load_parameters_fromdb(opts)) filter_exclude = opts.filter outdir = path.join(opts.workdir, '04_normalization') mkdir(outdir) mappability = gc_content = n_rsites = None if opts.normalization == 'oneD': if not opts.fasta: raise Exception('ERROR: missing path to FASTA for oneD normalization') if not opts.renz: raise Exception('ERROR: missing restriction enzyme name for oneD normalization') if not opts.mappability: raise Exception('ERROR: missing path to mappability for oneD normalization') bamfile = AlignmentFile(mreads, 'rb') refs = bamfile.references bamfile.close() # get genome sequence ~1 min printime(' - parsing FASTA') genome = parse_fasta(opts.fasta, verbose=False) fas = set(genome.keys()) bam = set(refs) if fas - bam: print 'WARNING: %d extra chromosomes in FASTA (removing them)' % (len(fas - bam)) if len(fas - bam) <= 50: print '\n'.join([(' - ' + c) for c in (fas - bam)]) if bam - fas: txt = ('\n'.join([(' - ' + c) for c in (bam - fas)]) if len(bam - fas) <= 50 else '') raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % ( len(bam - fas), txt)) refs = [crm for crm in refs if crm in genome] if len(refs) == 0: raise Exception("ERROR: chromosomes in FASTA different the ones" " in BAM") # get mappability ~2 min printime(' - Parsing mappability') mappability = parse_mappability_bedGraph( opts.mappability, opts.reso, wanted_chrom=refs[0] if len(refs)==1 else None) # resize chomosomes for c in refs: if not c in mappability: mappability[c] = [float('nan')] * (len(refs) / opts.reso + 1) if len(mappability[c]) < len(refs) / opts.reso + 1: mappability[c] += [float('nan')] * ( (len(refs) / opts.reso + 1) - len(mappability[c])) # concatenates mappability = reduce(lambda x, y: x + y, (mappability.get(c, []) for c in refs)) printime(' - Computing GC content per 
bin (removing Ns)') gc_content = get_gc_content(genome, opts.reso, chromosomes=refs, n_cpus=opts.cpus) # compute r_sites ~30 sec # TODO: read from DB printime(' - Computing number of RE sites per bin (+/- 200 bp)') n_rsites = [] re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '') for crm in refs: for pos in xrange(200, len(genome[crm]) + 200, opts.reso): seq = genome[crm][pos-200:pos + opts.reso + 200] n_rsites.append(seq.count(re_site)) ## CHECK TO BE REMOVED # out = open('tmp_mappability.txt', 'w') # i = 0 # for crm in refs: # for pos in xrange(len(genome[crm]) / opts.reso + 1): # out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso, pos * opts.reso + opts.reso, mappability[i])) # i += 1 # out.close() # compute GC content ~30 sec # TODO: read from DB biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam( mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2, factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus, normalization=opts.normalization, mappability=mappability, p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites, min_perc=opts.min_perc, max_perc=opts.max_perc, seed=opts.seed, normalize_only=opts.normalize_only, max_njobs=opts.max_njobs, extra_bads=opts.badcols, biases_path=opts.biases_path) bad_col_image = path.join(outdir, 'filtered_bins_%s_%s.png' % ( nicer(opts.reso).replace(' ', ''), param_hash)) inter_vs_gcoord = path.join(opts.workdir, '04_normalization', 'interactions_vs_genomic-coords.png_%s_%s.png' % ( opts.reso, param_hash)) # get and plot decay if not opts.normalize_only: printime(' - Computing interaction decay vs genomic distance') (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions( decay, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only, savefig=inter_vs_gcoord) print (' -> Decay slope 0.7-10 Mb\t%s' % a2) else: a2 = 0. 
printime(' - Saving biases and badcol columns') # biases bias_file = path.join(outdir, 'biases_%s_%s.pickle' % ( nicer(opts.reso).replace(' ', ''), param_hash)) out = open(bias_file, 'w') dump({'biases' : biases, 'decay' : decay, 'badcol' : badcol, 'resolution': opts.reso}, out, HIGHEST_PROTOCOL) out.close() finish_time = time.localtime() try: save_to_db(opts, bias_file, mreads, bad_col_image, len(badcol), len(biases), raw_cisprc, norm_cisprc, inter_vs_gcoord, a2, opts.filter, launch_time, finish_time) except: # release lock anyway print_exc() try: remove(path.join(opts.workdir, '__lock_db')) except OSError: pass exit(1)
def run(opts):
    """
    Normalize a Hi-C BAM at a given resolution; for 'oneD' normalization
    also gather genome sequence, mappability and restriction-site counts.
    Writes the biases pickle, decay plots and saves results to the DB.

    :param opts: parsed command-line options (workdir, bam, reso,
       normalization, fasta, renz, mappability, ...)
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))
    filter_exclude = opts.filter
    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)
    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        # oneD requires FASTA, enzyme and mappability inputs
        if not opts.fasta:
            raise Exception(
                'ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception(
                'ERROR: missing restriction enzyme name for oneD normalization'
            )
        if not opts.mappability:
            raise Exception(
                'ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()
        # get genome sequence ~1 min
        printime(' - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)
        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' %
                  (len(fas - bam)))
            if len(fas - bam) <= 50:
                print('\n'.join([(' - ' + c) for c in (fas - bam)]))
        if bam - fas:
            txt = ('\n'.join([(' - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception(
                'ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' %
                (len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")
        # get mappability ~2 min
        printime(' - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs) == 1 else None)
        # resize chomosomes
        # NOTE(review): len(refs) is the number of chromosome names, so
        # len(refs) // reso + 1 is almost always 1 — presumably
        # len(genome[c]) (chromosome length in bp) was intended here;
        # verify before relying on oneD mappability padding.
        for c in refs:
            if not c in mappability:
                mappability[c] = [float('nan')] * (len(refs) // opts.reso + 1)
            if len(mappability[c]) < len(refs) // opts.reso + 1:
                mappability[c] += [float('nan')] * (
                    (len(refs) // opts.reso + 1) - len(mappability[c]))
        # concatenates
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))
        printime(' - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime(' - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in range(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))
        ## CHECK TO BE REMOVED
        # out = open('tmp_mappability.txt', 'w')
        # i = 0
        # for crm in refs:
        #     for pos in xrange(len(genome[crm]) / opts.reso + 1):
        #         out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso, pos * opts.reso + opts.reso, mappability[i]))
        #         i += 1
        # out.close()
    # compute GC content ~30 sec
    # TODO: read from DB
    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        min_perc=opts.min_perc, max_perc=opts.max_perc, seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path)
    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords.png_%s_%s.png' % (opts.reso,
                                                          param_hash))
    # get and plot decay
    if not opts.normalize_only:
        printime(' - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso,
            normalized=not opts.filter_only, savefig=inter_vs_gcoord)
        print(' -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.
    printime(' - Saving biases and badcol columns')
    # biases
    bias_file = path.join(
        outdir,
        'biases_%s_%s.pickle' % (nicer(opts.reso).replace(' ', ''),
                                 param_hash))
    out = open(bias_file, 'wb')
    dump(
        {
            'biases': biases,
            'decay': decay,
            'badcol': badcol,
            'resolution': opts.reso
        }, out, HIGHEST_PROTOCOL)
    out.close()
    finish_time = time.localtime()
    try:
        save_to_db(opts, bias_file, mreads, len(badcol), len(biases),
                   raw_cisprc, norm_cisprc, inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:
        # release lock anyway
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
# Generate a random test genome, round-trip it through the FASTA parser and
# start computing restriction fragments.
# NOTE(review): assumes num_crms, mean_crm_size, nts, re_seq and enz_cut are
# defined earlier in the script — not visible in this chunk.
genome = {}
for crm in xrange(1, num_crms + 1):
    crm_len = int(mean_crm_size * random())
    genome['chr' + str(crm)] = ''.join([nts[int(401 * random())]
                                        for _ in xrange(crm_len)])
# write the genome as a FASTA file, 60 bases per line
out = open('test.fa~', 'w')
for crm in xrange(1, num_crms + 1):
    out.write('>chr%d\n' % crm)
    crm = 'chr' + str(crm)
    for p in xrange(0, len(genome[crm]), 60):
        out.write(genome[crm][p:p+60] + '\n')
out.close()
# sanity check: the parser must return exactly what was written
genome_bis = parse_fasta('test.fa~')
if genome_bis == genome:
    genome = genome_bis
else:
    raise Exception('problem with genome parser')
# RE FRAGMENTS
frags = {}
for crm in genome:
    frags[crm] = {}
    beg = 0
    for pos in re.finditer(re_seq, genome[crm]):
        end = pos.start() + 1 + enz_cut
        # skip zero-length fragments
        if beg == end:
            continue
# Driver snippet: iterative mapping of a Drosophila (dm3) Hi-C test dataset
# with TADbit.  Chunk size is taken from the command line.
from pytadbit.parsers.genome_parser import parse_fasta
from pytadbit.mapping.mapper import get_intersection  # NOTE(review): unused in this chunk -- presumably used later in the file

chunk = 10000000
# default above is immediately overridden by the first command-line argument
# NOTE(review): `sys` is presumably imported earlier in the file -- confirm
chunk = int(sys.argv[1])

PATH = '/home/fransua/Documents/Courses/given/2014_CSDM/notebooks/'
INFILE = '/home/fransua/Documents/Courses/given/2014_CSDM/notebooks/fastq/%s.fastq'
rep = 'SRR_test'
INFILE = INFILE % rep
OUTPATH = PATH + rep + '_' + str(chunk) + '/'

# one FASTA file per chromosome of the dm3 assembly
chr_names = ['2L', '2R', '3L', '3R', '4', 'X']
genome_seq = parse_fasta([PATH + 'dmel_reference/chr%s.fa' % crm
                          for crm in chr_names], chr_names)

# restriction fragments for HindIII over the parsed genome
# NOTE(review): map_re_sites and iterative_mapping are presumably imported
# earlier in the file -- confirm
frags = map_re_sites('HindIII', genome_seq, verbose=True)

sams1 = iterative_mapping(
    gem_index_path = PATH + 'dmel_reference/dm3.genome.gem',
    fastq_path = INFILE,
    out_sam_path = OUTPATH + '%s_r1.txt' % rep,
    temp_dir = PATH + 'tmp_dir/',
    range_start = [10] * 5,  # starts with a flag sequence
    range_stop = range(30, 55, 5),  # iterative mapping windows end at 30..50 bp
    nthreads = 8,  # on intel corei7 CPUs 4 threads are as fast as
                   # 8, but leave some room for your other applications
    max_reads_per_chunk = chunk,
    single_end = True)
print 'created thes SAM files:', sams1
def main(): fastq = '/scratch/db/FASTQs/hsap/dixon_2012/dixon-2012_200bp.fastq' fastq = 'short_dixon-2012_200bp.fastq' # fastq = '/scratch/test/sample_dataset/FASTQs/sample_hsap_HindIII.fastq' gem_index_path = '/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.gem' out_map_dir1 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read1/' out_map_dir2 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read2/' temp_dir1 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp1/' temp_dir2 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp2/' print 'read 1' outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII', temp_dir=temp_dir1, windows=((1, 100), ), add_site=True) print 'read 2' outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII', temp_dir=temp_dir2, windows=((101, 200), ), add_site=True) # print 'read 1' # outfiles1 = mapping(gem_index_path, fastq, out_map_dir1, 'HindIII', # temp_dir=temp_dir1, # windows=(zip(*([0] * len(range(25, 105, 5)), # range(25,105,5))))) # print 'read 2' # outfiles2 = mapping(gem_index_path, fastq, out_map_dir2, 'HindIII', # temp_dir=temp_dir2, # windows=(zip(*([100] * len(range(125, 205, 5)), # range(125,205,5))))) print outfiles1 print 'xcmvnkljnv' print outfiles2 from pytadbit.parsers.map_parser import parse_map from pytadbit.parsers.genome_parser import parse_fasta from pytadbit.mapping.mapper import get_intersection from pytadbit.mapping.filter import filter_reads, apply_filter read1, read2 = 'read1.tsv', 'read2.tsv', parse_map(outfiles1, outfiles2, out_file1=read1, out_file2=read2, genome_seq=parse_fasta( '/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.fa'), re_name='HindIII', verbose=True) reads = 'both_reads.tsv' get_intersection(read1, read2, reads) masked = filter_reads(reads) freads = 'filtered_reads.tsv' apply_filter(reads, freads, masked)
for crm in xrange(1, num_crms + 1): crm_len = int(mean_crm_size * random()) genome['chr' + str(crm)] = ''.join([nts[int(401 * random())] for _ in xrange(crm_len)]) out = open('test.fa~', 'w') for crm in xrange(1, num_crms + 1): out.write('>chr%d\n' % crm) crm = 'chr' + str(crm) for p in xrange(0, len(genome[crm]), 60): out.write(genome[crm][p:p+60] + '\n') out.close() from pytadbit.parsers.genome_parser import parse_fasta genome_bis = parse_fasta('test.fa~') if genome_bis == genome: genome = genome_bis else: raise Exception('problem with genome parser') # RE FRAGMENTS frags = {} for crm in genome: frags[crm] = {} beg = 0 for pos in re.finditer(re_seq, genome[crm]): end = pos.start() + 1 + enz_cut if beg == end: continue
def run(opts):
    """
    Parse previously mapped Hi-C reads and store parsing statistics.

    Fetches map-file names and restriction-enzyme name(s) from the project
    database, loads the genomic sequence (pickled genome if possible, FASTA
    otherwise), then either parses the map files with parse_map or, when
    opts.skip is set, recovers statistics from already-written output TSVs.
    Statistics are appended to the machine-readable trace log (protected by
    a lock file) and finally saved to the project SQLite database.

    :param opts: argparse-like namespace; fields used here include read,
        jobids, workdir, genome, filter_chrom, skip and compress_input
    """
    check_options(opts)
    launch_time = time.localtime()

    # which read-ends to process: [1], [2] or both
    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]
    f_names1, f_names2, renz = load_parameters_fromdb(opts, reads, opts.jobids)

    # several enzymes may be stored joined by dashes
    renz = renz.split('-')

    opts.workdir = path.abspath(opts.workdir)

    name = path.split(opts.workdir)[-1]

    # parameter hash makes output file names unique per job
    param_hash = digest_parameters(opts)

    outdir = '02_parsed_reads'

    mkdir(path.join(opts.workdir, outdir))

    if not opts.read:
        # both read-ends
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))
    elif opts.read == 1:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = None
        f_names2 = None
    elif opts.read == 2:
        # read 2 only: shift its inputs into the first slot so the single
        # parse below works on f_names1/out_file1
        out_file2 = None
        f_names1 = f_names2
        f_names2 = None
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))

    logging.info('parsing genomic sequence')
    try:
        # allows the use of a pickled genome to make it faster
        genome = load(open(opts.genome[0], 'rb'))
    except (UnpicklingError, KeyError):
        genome = parse_fasta(opts.genome, chr_regexp=opts.filter_chrom)

    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        counts, multis = parse_map(f_names1, f_names2, out_file1=out_file1,
                                   out_file2=out_file2, re_name=renz,
                                   verbose=True, genome_seq=genome,
                                   compress=opts.compress_input)
    else:
        # opts.skip: recover statistics from headers of existing output TSVs
        # instead of re-parsing the map files
        counts = {}
        counts[0] = {}
        fhandler = open(out_file1)
        for line in fhandler:
            if line.startswith('# MAPPED '):
                _, _, item, value = line.split()
                counts[0][item] = int(value)
            elif not line.startswith('#'):
                break  # end of the '#'-prefixed header
        multis = {}
        multis[0] = {}
        # count multi-contact entries: fragments are '|||'-separated per line
        for line in fhandler:
            if '|||' in line:
                try:
                    multis[0][line.count('|||')] += 1
                except KeyError:
                    multis[0][line.count('|||')] = 1
        if out_file2:
            counts[1] = {}
            fhandler = open(out_file2)
            for line in fhandler:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[1][item] = int(value)
                elif not line.startswith('#'):
                    break
            # NOTE(review): multis[1] is a plain int total here while
            # multis[0] above is a per-multiplicity dict -- confirm that
            # downstream consumers (save_to_db) accept both shapes
            multis[1] = 0
            for line in fhandler:
                if '|||' in line:
                    multis[1] += line.count('|||')

    # write machine log; spin-wait on, then take, the lock file
    while path.exists(path.join(opts.workdir, '__lock_log')):
        time.sleep(0.5)
    open(path.join(opts.workdir, '__lock_log'), 'a').close()
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        for read in counts:
            for item in counts[read]:
                mlog.write('# PARSED READ%s PATH\t%d\t%s\n' % (
                    read, counts[read][item],
                    out_file1 if read == 1 else out_file2))
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_log'))
    except OSError:
        pass

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2,
               launch_time, finish_time)
outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII', temp_dir=temp_dir1, frag_map=False, windows=(zip(*(r_beg1, r_end1)))) print 'read 2' outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII', temp_dir=temp_dir2, frag_map=False, windows=(zip(*(r_beg2, r_end2)))) parse_thing = parse_map elif mapper == 3: print 'read 1' outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII', temp_dir=temp_dir1, windows=(zip(*(r_beg1, r_end1)))) print 'read 2' outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII', temp_dir=temp_dir2, windows=(zip(*(r_beg2, r_end2)))) parse_thing = parse_map read1, read2 = 'read1.tsv_%s-%s' % (mapper, win), 'read2.tsv_%s-%s' % (mapper, win) parse_thing(outfiles1, outfiles2, out_file1=read1, out_file2=read2, genome_seq=parse_fasta('/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.fa'), re_name='HindIII', verbose=True) reads = 'both_reads.tsv_%s-%s' % (mapper, win) get_intersection(read1, read2, reads) masked = filter_reads(reads) freads = 'filtered_reads.tsv_%s-%s' % (mapper, win) apply_filter(reads, freads, masked)
def run(self, input_files, output_files, metadata=None):  # pylint: disable=too-many-locals,arguments-differ,inconsistent-return-statements
    """
    The main function to map the aligned reads and return the matching
    pairs.

    Parsing of the mappings can be either iterative or fragment based. If
    it is to be iterative then the locations of 4 output file windows for
    each end of the paired end window need to be provided. If it is
    fragment based, then only 2 window locations need to be provided along
    with an enzyme name.

    Parameters
    ----------
    input_files : list
        genome_file : str
            Location of the genome FASTA file
        window1_1 : str
            Location of the first window index file
        window1_2 : str
            Location of the second window index file
        window1_3 : str
            [OPTIONAL] Location of the third window index file
        window1_4 : str
            [OPTIONAL] Location of the fourth window index file
        window2_1 : str
            Location of the first window index file
        window2_2 : str
            Location of the second window index file
        window2_3 : str
            [OPTIONAL] Location of the third window index file
        window2_4 : str
            [OPTIONAL] Location of the fourth window index file
    metadata : dict
        windows : list
            List of lists with the window sizes to be computed
        enzyme_name : str
            Restriction enzyme name
        mapping : list
            The mapping function used. The options are iter or frag.

    Returns
    -------
    output_files : list
        List of locations for the output files.
    output_metadata : dict
        Dict of matching metadata dict objects

    Example
    -------
    Iterative:

    .. code-block:: python

        from tool import tb_parse_mapping

        genome_file = 'genome.fasta'

        root_name_1 = "/tmp/data/expt_source_1".split("/")
        root_name_2 = "/tmp/data/expt_source_2".split("/")
        windows = [[1, 25], [1, 50], [1, 75], [1, 100]]

        windows1 = []
        windows2 = []

        for w in windows:
            tail = "_full_" + str(w[0]) + "-" + str(w[1]) + ".map"
            windows1.append('/'.join(root_name_1) + tail)
            windows2.append('/'.join(root_name_2) + tail)

        files = [genome_file] + windows1 + windows2

        tpm = tb_parse_mapping.tb_parse_mapping()
        metadata = {'enzyme_name': 'MboI', 'mapping': ['iter', 'iter'],
                    'expt_name': 'test'}
        tpm_files, tpm_meta = tpm.run(files, metadata)

    Fragment based mapping:

    .. code-block:: python

        from tool import tb_parse_mapping

        genome_file = 'genome.fasta'

        root_name_1 = "/tmp/data/expt_source_1".split("/")
        root_name_2 = "/tmp/data/expt_source_2".split("/")

        windows = [[1, 100]]
        start = str(windows[0][0])
        end = str(windows[0][1])

        window1_1 = '/'.join(root_name_1) + "_full_" + start + "-" + end + ".map"
        window1_2 = '/'.join(root_name_1) + "_frag_" + start + "-" + end + ".map"
        window2_1 = '/'.join(root_name_2) + "_full_" + start + "-" + end + ".map"
        window2_2 = '/'.join(root_name_2) + "_frag_" + start + "-" + end + ".map"

        files = [
            genome_file,
            window1_1, window1_2,
            window2_1, window2_2,
        ]

        tpm = tb_parse_mapping.tb_parse_mapping()
        metadata = {'enzyme_name': 'MboI', 'mapping': ['frag', 'frag'],
                    'expt_name': 'test'}
        tpm_files, tpm_meta = tpm.run(files, metadata)
    """
    genome_file = input_files[0]

    enzyme_name = metadata['enzyme_name']
    mapping_list = metadata['mapping']
    expt_name = metadata['expt_name']

    # output TSVs go next to the first window file
    root_name = input_files[1].split("/")
    reads = "/".join(root_name[0:-1]) + '/'

    genome_seq = parse_fasta(genome_file)

    # [name, length] pairs for every chromosome in the parsed genome
    chromosome_meta = []
    for k in genome_seq:
        chromosome_meta.append([k, len(genome_seq[k])])

    # input and output share most metadata
    output_metadata = {'chromosomes': chromosome_meta}

    # both ends must have been mapped the same way
    if mapping_list[0] == mapping_list[1]:
        if mapping_list[0] == 'iter':
            # iterative mapping: 4 window files per read-end
            window1_1 = input_files[1]
            window1_2 = input_files[2]
            window1_3 = input_files[3]
            window1_4 = input_files[4]

            window2_1 = input_files[5]
            window2_2 = input_files[6]
            window2_3 = input_files[7]
            window2_4 = input_files[8]

            read_iter = reads + expt_name + '_iter.tsv'

            self.tb_parse_mapping_iter(genome_seq, enzyme_name,
                                       window1_1, window1_2,
                                       window1_3, window1_4,
                                       window2_1, window2_2,
                                       window2_3, window2_4,
                                       read_iter)
            # results = compss_wait_on(results)

            return ([read_iter], output_metadata)
        elif mapping_list[0] == 'frag':
            # fragment-based mapping: full + frag file per read-end
            window1_full = input_files[1]
            window1_frag = input_files[2]

            window2_full = input_files[3]
            window2_frag = input_files[4]

            read_frag = reads + expt_name + '_frag.tsv'

            self.tb_parse_mapping_frag(genome_seq, enzyme_name,
                                       window1_full, window1_frag,
                                       window2_full, window2_frag,
                                       read_frag)
            # results = compss_wait_on(results)

            return ([read_frag], output_metadata)

    # fall-through: mismatched or unknown mapping types yield [None]
    reads = None
    return ([reads], output_metadata)