def omsim(settings):
    """Run one simulation and write a BNX file per label and chip.

    Steps, in order: change into ``settings.directory``, import and
    process the input maps, write the processed input, filter it, then
    for every chip generate molecules scan by scan and write them to
    ``<settings.prefix>.<label>.<chip>.bnx``.

    Args:
        settings: Run configuration. This function reads ``directory``,
            ``chips``, ``coverage``, ``scans_per_chip``, ``labels``,
            ``min_nicks``, ``prefix`` and ``name`` and calls
            ``get_scan_size()``. It is also passed on to the input
            helpers and to ``Noise`` / ``BNX``.

    Side effects:
        Changes the process working directory, sets
        ``settings.estimated_coverage``, replaces ``settings.chips`` by
        an estimate (at least 1) when it is 0, prints progress messages
        and creates the output files.
    """
    # NOTE(review): the nesting of this function was reconstructed from
    # a whitespace-collapsed source. It is assumed that the molecule
    # stretch is drawn, and 'molecule_count' / 'size' are updated, once
    # per generated molecule -- confirm against the upstream file.

    # cd to the directory containing the configuration file
    os.chdir(settings.directory)

    # process input
    cmaps = import_input(settings)
    print('Imported ' + str(sum(cmaps[iname].count() for iname in cmaps))
          + ' nicks in '
          + str(sum(cmaps[iname].seq_len() for iname in cmaps)) + 'bp.')
    cmaps = KMP(settings, cmaps)
    # write processed input
    write_processed_input(settings, cmaps)

    # filter input for enzymes / files we need
    # (seqs is only needed by the disabled BED output below)
    seqs, seq_lens, fns = filter_input(settings, cmaps)

    # running totals of the sequence lengths
    cum_seq_lens = []
    running_total = 0
    for seq_len in seq_lens:
        running_total += seq_len
        cum_seq_lens.append(running_total)
    print('Using ' + str(sum(len(f) for f in fns)) + ' nicks in '
          + str(sum(seq_lens)) + 'bp.')

    # compute reverse nicking sites
    rns = get_rns(settings, fns, seq_lens)

    # estimate number of chips based on expected coverage
    if settings.chips == 0:
        temp = int(sum(seq_lens) * settings.coverage
                   / (settings.scans_per_chip * settings.get_scan_size()))
        settings.chips = max(temp, 1)

    # estimate coverage
    settings.estimated_coverage = int(
        settings.get_scan_size() * settings.scans_per_chip * settings.chips
        / float(sum(seq_lens)))
    print('Generating reads on ' + str(settings.chips) + ' chip'
          + ('' if settings.chips == 1 else 's')
          + ', estimated coverage: ' + str(settings.estimated_coverage)
          + 'x.')

    noise = Noise(settings)
    bnx = BNX(settings, noise)

    for chip in range(1, settings.chips + 1):
        chip_settings = {
            'size': 0,
            'scans': 0,
            'chip_id': '20249,11843,07/17/2014,840014289',
            'run_id': str(chip),
            'flowcell': 1,
            'molecule_count': 0,
            'bpp': 425,
            'stretch_factor': noise.chip_stretch_factor(),
        }
        chip_settings['bpp'] /= chip_settings['stretch_factor']
        molecules = {label: [] for label in settings.labels}

        # generate reads
        for _scan in range(1, settings.scans_per_chip + 1):
            chip_settings['scans'] += 1
            scan_stretch = noise.scan_stretch_factor(
                chip_settings['stretch_factor'])
            for length, nicks, meta in noise.generate_scan(
                    seq_lens, cum_seq_lens, fns, rns):
                # split the nick positions of this molecule by label
                molecule = {label: [] for label in settings.labels}
                for nick in nicks:
                    molecule[nick[1]['label']].append(nick[0])
                stretch = (noise.mol_stretch_factor(scan_stretch)
                           / chip_settings['stretch_factor'])
                for label in settings.labels:
                    if settings.min_nicks <= len(molecule[label]):
                        # The stretch travels with the molecule. Looking
                        # it up later by the per-label molecule number
                        # picks the wrong entry as soon as the min_nicks
                        # filter drops a molecule for that label.
                        molecules[label].append(
                            (length, molecule[label],
                             chip_settings['scans'], meta, stretch))
                chip_settings['molecule_count'] += 1
                chip_settings['size'] += length

        # write output
        for label in settings.labels:
            out_name = (settings.prefix + '.' + label + '.' + str(chip)
                        + '.bnx')
            #bedfile = open(settings.prefix + '.' + label + '.' + str(chip) + '.bed', 'w')
            # 'with' closes the file even if writing an entry fails
            with open(out_name, 'w') as ofile:
                bnx.write_bnx_header(ofile, label, chip_settings)
                # molecule IDs restart at 1 in every output file
                for moleculeID, entry in enumerate(molecules[label], 1):
                    length, nicks, scan_nr, meta, stretch = entry
                    bnx.write_bnx_entry((moleculeID, length, scan_nr),
                                        nicks, ofile, chip_settings,
                                        stretch)
                    #for idx, mol in enumerate(meta):
                    #bedfile.write(seqs[mol[0]] + '\t' + str(mol[1]) + '\t' + str(mol[1] + l) + '\t' + str(moleculeID) + ('.' + str(idx) if len(meta) > 1 else '') + '\n')
            #bedfile.close()
        print('Finished chip ' + str(chip) + '/' + str(settings.chips))
    print('Finished processing ' + settings.name + '.\n')
def omsim(settings):
    """Run one simulation and write a BNX file per label and chip.

    Steps, in order: change into ``settings.directory``, import and
    process the input maps, write the processed input, filter it, then
    for every chip generate molecules scan by scan and write them to
    ``<settings.prefix>.<label>.<chip>.bnx``.

    Args:
        settings: Run configuration. This function reads ``directory``,
            ``chips``, ``coverage``, ``scans_per_chip``, ``labels``,
            ``min_nicks``, ``prefix`` and ``name`` and calls
            ``get_scan_size()``. It is also passed on to the input
            helpers and to ``Noise`` / ``BNX``.

    Side effects:
        Changes the process working directory, sets
        ``settings.estimated_coverage``, replaces ``settings.chips`` by
        an estimate (at least 1) when it is 0, prints progress messages
        and creates the output files.
    """
    # NOTE(review): the nesting of this function was reconstructed from
    # a whitespace-collapsed source. It is assumed that the molecule
    # stretch is drawn, and 'molecule_count' / 'size' are updated, once
    # per generated molecule -- confirm against the upstream file.

    # cd to the directory containing the configuration file
    os.chdir(settings.directory)

    # process input
    cmaps = import_input(settings)
    print('Imported ' + str(sum(cmaps[iname].count() for iname in cmaps))
          + ' nicks in '
          + str(sum(cmaps[iname].seq_len() for iname in cmaps)) + 'bp.')
    cmaps = KMP(settings, cmaps)
    # write processed input
    write_processed_input(settings, cmaps)

    # filter input for enzymes / files we need
    # (seqs is only needed by the disabled BED output below)
    seqs, seq_lens, fns = filter_input(settings, cmaps)

    # running totals of the sequence lengths
    cum_seq_lens = []
    running_total = 0
    for seq_len in seq_lens:
        running_total += seq_len
        cum_seq_lens.append(running_total)
    print('Using ' + str(sum(len(f) for f in fns)) + ' nicks in '
          + str(sum(seq_lens)) + 'bp.')

    # compute reverse nicking sites
    rns = get_rns(settings, fns, seq_lens)

    # estimate number of chips based on expected coverage
    if settings.chips == 0:
        temp = int(sum(seq_lens) * settings.coverage
                   / (settings.scans_per_chip * settings.get_scan_size()))
        settings.chips = max(temp, 1)

    # estimate coverage
    settings.estimated_coverage = int(
        settings.get_scan_size() * settings.scans_per_chip * settings.chips
        / float(sum(seq_lens)))
    print('Generating reads on ' + str(settings.chips) + ' chip'
          + ('' if settings.chips == 1 else 's')
          + ', estimated coverage: ' + str(settings.estimated_coverage)
          + 'x.')

    noise = Noise(settings)
    bnx = BNX(settings, noise)

    for chip in range(1, settings.chips + 1):
        chip_settings = {
            'size': 0,
            'scans': 0,
            'chip_id': '20249,11843,07/17/2014,840014289',
            'run_id': str(chip),
            'flowcell': 1,
            'molecule_count': 0,
            'bpp': 425,
            'stretch_factor': noise.chip_stretch_factor(),
        }
        chip_settings['bpp'] /= chip_settings['stretch_factor']
        molecules = {label: [] for label in settings.labels}

        # generate reads
        for _scan in range(1, settings.scans_per_chip + 1):
            chip_settings['scans'] += 1
            scan_stretch = noise.scan_stretch_factor(
                chip_settings['stretch_factor'])
            for length, nicks, meta in noise.generate_scan(
                    seq_lens, cum_seq_lens, fns, rns):
                # split the nick positions of this molecule by label
                molecule = {label: [] for label in settings.labels}
                for nick in nicks:
                    molecule[nick[1]['label']].append(nick[0])
                stretch = (noise.mol_stretch_factor(scan_stretch)
                           / chip_settings['stretch_factor'])
                for label in settings.labels:
                    if settings.min_nicks <= len(molecule[label]):
                        # The stretch travels with the molecule. Looking
                        # it up later by the per-label molecule number
                        # picks the wrong entry as soon as the min_nicks
                        # filter drops a molecule for that label.
                        molecules[label].append(
                            (length, molecule[label],
                             chip_settings['scans'], meta, stretch))
                chip_settings['molecule_count'] += 1
                chip_settings['size'] += length

        # write output
        for label in settings.labels:
            out_name = (settings.prefix + '.' + label + '.' + str(chip)
                        + '.bnx')
            #bedfile = open(settings.prefix + '.' + label + '.' + str(chip) + '.bed', 'w')
            # 'with' closes the file even if writing an entry fails
            with open(out_name, 'w') as ofile:
                bnx.write_bnx_header(ofile, label, chip_settings)
                # molecule IDs restart at 1 in every output file
                for moleculeID, entry in enumerate(molecules[label], 1):
                    length, nicks, scan_nr, meta, stretch = entry
                    bnx.write_bnx_entry((moleculeID, length, scan_nr),
                                        nicks, ofile, chip_settings,
                                        stretch)
                    #for idx, mol in enumerate(meta):
                    #bedfile.write(seqs[mol[0]] + '\t' + str(mol[1]) + '\t' + str(mol[1] + l) + '\t' + str(moleculeID) + ('.' + str(idx) if len(meta) > 1 else '') + '\n')
            #bedfile.close()
        print('Finished chip ' + str(chip) + '/' + str(settings.chips))
    print('Finished processing ' + settings.name + '.\n')