def run_batch(self, batch):
    """Transform every track in `batch`, reporting progress and ETA.

    Tracks whose transform returns None or raises are dropped from the batch
    before the surviving results are written out. Always terminates the
    worker process via exit() when done.
    """
    c = config.Configuration()
    remove = set()  # tracks to drop: transform failed or produced no output
    n = 0
    start = time.time()
    for track in batch:
        try:
            batch[track] = self.transform(batch[track], track)
            # `is None`, not `== None`: identity test for the None singleton
            if batch[track] is None:
                remove.add(track)
        except Exception as e:
            print(red(e))
            traceback.print_exc()
            remove.add(track)
        n += 1
        t = time.time()
        p = float(n) / len(batch)
        # remaining fraction scaled by the observed rate, converted to hours
        eta = (1.0 - p) * ((1.0 / p) * (t - start)) / 3600
        print('{:2d}'.format(self.index), 'progress:', '{:7.5f}'.format(p), 'ETA:', '{:8.6f}'.format(eta))
        if n % 1000 == 0:
            gc.collect()
    for track in remove:
        batch.pop(track, None)
    # if there is no output, don't write anything
    self.output_batch(batch)
    self.on_exit_worker()
    exit()
def transform(self, track, track_name):
    """Load a track's kmers and merge them into the shared self.kmers index.

    Kmers containing 'N' are skipped. On first sight of a kmer its counters
    and per-locus flanking-sequence masks are initialized; for every track
    the loci that lie inside the event (or within the track's coordinates)
    contribute their masks to the kmer's interest_masks.
    Always returns None so the caller discards the per-track batch entry.
    """
    c = config.Configuration()
    with open(os.path.join(self.get_previous_job_directory(), track), 'r') as json_file:
        kmers = json.load(json_file)
    t = c.tracks[track_name]
    for kmer in kmers:
        if kmer.find('N') != -1:
            continue
        # NOTE(review): dropped an unused `seed = sum(map(ord, kmer))` local
        if kmer not in self.kmers:
            self.kmers[kmer] = {}
            self.kmers[kmer]['loci'] = kmers[kmer]['loci']
            self.kmers[kmer]['total'] = 0
            self.kmers[kmer]['count'] = 0
            self.kmers[kmer]['doubt'] = 0
            self.kmers[kmer]['tracks'] = kmers[kmer]['tracks']
            self.kmers[kmer]['reference'] = kmers[kmer]['reference']
            self.kmers[kmer]['interest_masks'] = {}
            for locus in self.kmers[kmer]['loci']:
                self.kmers[kmer]['loci'][locus]['masks'] = {
                    self.kmers[kmer]['loci'][locus]['seq']['left']: True,
                    self.kmers[kmer]['loci'][locus]['seq']['right']: True,
                }
        for locus in self.kmers[kmer]['loci']:
            # locus names look like '<chrom>_<position>...' — TODO confirm
            tokens = locus.split('_')
            if 'inside' in locus or (tokens[0].lower() == t.chrom.lower() and int(tokens[1]) >= t.begin and int(tokens[1]) < t.end):
                self.kmers[kmer]['interest_masks'].update(self.kmers[kmer]['loci'][locus]['masks'])
    return None
def import_lp_values(self, path='solution.mps'):
    """Parse LP variable values from a solution file into self.solution.

    Args:
        path: solution file name inside the current job directory.
            (Previously this argument was ignored and 'solution.mps' was
            hardcoded; the default preserves the old behavior.)

    Variables are named 'lp<sample_index>c<sanitized_track_name>'; each value
    lands at self.solution[sample_index][track_index].
    """
    c = config.Configuration()
    self.solution = [[0.0] * len(self.tracks) for _ in self.paths]
    var_index = {}
    regex = re.compile('[^a-zA-Z0-9]')
    for p, _ in enumerate(self.paths):
        for track in self.tracks:
            # track names are sanitized the same way the LP generator does it
            name = regex.sub('_', track)
            var_index['lp' + str(p) + 'c' + name] = self.tracks[track]['index']
    with open(os.path.join(self.get_current_job_directory(), path), 'r') as f:
        status = f.readline()
        objective = f.readline()
        line = f.readline()
        while line:
            tokens = line.split()
            name = tokens[1]
            value = float(tokens[2])
            if name.startswith('lp'):
                index = var_index[name]
                sample = int(name[2:name.find('c')])
                self.solution[sample][index] = value
            line = f.readline()
def load_inputs(self):
    """Load gapped kmers from the previous job and index them by half-mers.

    Only kmers with a known gap (gap != -1) are kept. Each kmer is indexed by
    its left/right half-mers and also by their reverse complements so reads
    from either strand can be matched.
    """
    c = config.Configuration()
    self.kmers = {}
    tracks = self.load_previous_job_results()
    self.half_mers = {}
    for track in tracks:
        print(cyan(track))
        with open(os.path.join(self.get_previous_job_directory(), tracks[track]), 'r') as json_file:
            kmers = json.load(json_file)
            for kmer in kmers:
                if kmers[kmer]['gap'] != -1:
                    left = kmer[:c.hsize]
                    right = kmer[-c.hsize:]
                    self.kmers[kmer] = kmers[kmer]
                    self.kmers[kmer]['count'] = 0
                    self.kmers[kmer]['doubt'] = 0
                    self.half_mers.setdefault(left, {})[right] = kmer
                    # index the opposite strand as well
                    left = reverse_complement(left)
                    right = reverse_complement(right)
                    self.half_mers.setdefault(right, {})[left] = kmer
    print(len(self.kmers), 'kmers')
    self.export_accelerator_input()
    self.round_robin()
def reduce(self):
    """Merge per-batch counts, then export kmers globally and per track."""
    c = config.Configuration()
    self.merge_counts()
    with open(os.path.join(self.get_current_job_directory(), 'kmers.json'), 'w') as json_file:
        json.dump(self.kmers, json_file, indent=4, sort_keys=True)
    # group kmers by the tracks they belong to
    self.tracks = {}
    for kmer, data in self.kmers.items():
        for track in data['tracks']:
            self.tracks.setdefault(track, {})[kmer] = data
    # one JSON file per track, plus an index for the merge stage
    for track, track_kmers in self.tracks.items():
        with open(os.path.join(self.get_current_job_directory(), track + '.json'), 'w') as json_file:
            json.dump(track_kmers, json_file, indent=4, sort_keys=True)
    with open(os.path.join(self.get_current_job_directory(), 'batch_merge.json'), 'w') as json_file:
        json.dump({track: track + '.json' for track in self.tracks}, json_file, indent=4)
def export_solution(self):
    """Round LP coverage values into genotypes and write them as a BED file."""
    c = config.Configuration()
    self.errors = self.solution[len(self.tracks):]
    for track in self.tracks:
        self.tracks[track]['coverage'] = self.solution[self.tracks[track]['index']]
        self.tracks[track]['lp_kmers'] = []
    for kmer in self.lp_kmers:
        for track in kmer['tracks']:
            self.tracks[track]['lp_kmers'].append(kmer)
    self.find_rounding_break_points()
    print('Rounding', len(self.tracks), 'tracks')
    # CGC mode writes a sample-specific file into the working directory
    if c.cgc:
        sample = c.fastq.split('/')[-1] if c.fastq else c.bam.split('/')[-1]
        name = 'genotypes_' + sample + '.bed'
        directory = os.getcwd()
    else:
        name = 'merge.bed'
        directory = self.get_current_job_directory()
    with open(os.path.join(directory, name), 'w') as bed_file:
        bed_file.write('CHROM\tBEGIN\tEND\tLP_GENOTYPE\tLP_VALUE\tID\n')
        for track in self.tracks:
            t = c.tracks[track]
            index = self.tracks[track]['index']
            g = self.round_genotype(self.solution[index], t.svtype)
            fields = [t.chrom, str(t.begin), str(t.end), str(g[1]), str(self.solution[index]), str(t.id)]
            bed_file.write('\t'.join(fields) + '\n')
    self.export_kmers()
def transform(self):
    """Run the external C++ kmer counter over the input BAM or FASTQ file(s).

    The counter binary is invoked once for a BAM input, or once per FASTQ
    file; for FASTQ inputs the per-batch output is renamed after each run so
    successive runs don't overwrite each other. Always terminates the worker
    via exit().
    """
    c = config.Configuration()
    cpp_dir = os.path.join(os.path.dirname(__file__), '../../cpp')
    counter = os.path.join(cpp_dir, 'counter.out')

    def run_counter(input_file):
        # argument-list form (shell=False) avoids quoting issues with paths
        subprocess.call([
            counter,
            str(self.index),
            self.get_current_job_directory(),
            input_file,
            str(self.num_threads),
            str(self._counter_mode),
            '1' if c.debug else '0',
            '1' if c.simulation else '0',
        ])

    if c.bam:
        run_counter(c.bam)
    else:
        for i, fastq_file in enumerate(c.fastq):
            run_counter(fastq_file)
            # rename the batch output so the next FASTQ doesn't clobber it
            src = os.path.join(self.get_current_job_directory(), 'c_batch_' + str(self.index) + '.json')
            dst = os.path.join(self.get_current_job_directory(), 'c_batch_' + str(self.index) + '.' + str(i) + '.json')
            os.rename(src, dst)
    exit()
def reduce(self):
    """Estimate mean/std of kmer coverage, iteratively trimming outliers.

    Statistics are computed on all counts, recomputed after discarding counts
    above 3x the mean, and again after discarding counts above 2x the refined
    mean. The final values are written to stats_<ksize>.json.
    """
    c = config.Configuration()
    self.kmers = {}
    self.merge_counts()
    self.counts = list(self.kmers.values())
    self._update_stats()
    # first pass: drop extreme outliers above 3x the raw mean
    self.counts = [x for x in self.counts if x < 3 * self.mean]
    self._update_stats()
    # second pass: tighten to 2x the refined mean
    self.counts = [x for x in self.counts if x < 2 * self.mean]
    self._update_stats()
    with open(os.path.join(self.get_current_job_directory(), 'stats_' + str(c.ksize) + '.json'), 'w') as json_file:
        json.dump({'mean': self.mean, 'std': self.std}, json_file, sort_keys=True, indent=4)

def _update_stats(self):
    # recompute and report mean/std of the current counts
    self.mean = numpy.mean(self.counts)
    self.std = numpy.std(self.counts)
    print(len(self.counts))
    print('mean:', self.mean)
    print('std:', self.std)
def load_inputs(self):
    """Extract the whole genome and distribute large tracks among workers."""
    c = config.Configuration()
    extract_whole_genome()
    self.tracks = self.load_tracks()

    def is_large(track):
        # only tracks longer than 1 Mbp are worth processing
        return track.end - track.begin > 1000000

    self.round_robin(self.tracks, filter_func=is_large)
def export_solution(self):
    """Write per-sample genotyping results for every track and sample path."""
    c = config.Configuration()
    for track in self.tracks:
        self.tracks[track]['coverage'] = []
        self.tracks[track]['lp_kmers'] = []
    for kmer in self.lp_kmers:
        for track in kmer['tracks']:
            self.tracks[track]['lp_kmers'].append(kmer)
    header = 'CHROM\tBEGIN\tEND\tLP_GENOTYPE\tLP_VALUE\tID\n'
    for p, path in enumerate(self.paths):
        print('Genotyping sample', path)
        self.find_rounding_break_points()
        merge_path = os.path.join(self.get_current_job_directory(), 'merge_' + str(p) + '.bed')
        union_path = os.path.join(path, 'CgcIntegerProgrammingJob', 'union.bed')
        # the same rows go to the local merge file and the sample's union file
        with open(merge_path, 'w') as bed_file, open(union_path, 'w') as cluster_file:
            bed_file.write(header)
            cluster_file.write(header)
            for track in self.tracks:
                t = c.tracks[track]
                index = self.tracks[track]['index']
                g = self.round_genotype(self.solution[p][index], t.svtype)
                line = t.chrom + '\t' + str(t.begin) + '\t' + str(t.end) + '\t' +\
                    str(g[1]) + '\t' + str(self.solution[p][index]) + '\t' + str(t.id) + '\n'
                bed_file.write(line)
                cluster_file.write(line)
def load_inputs(self):
    """Apply GC-content coverage correction to kmers and export them per track."""
    c = config.Configuration()
    with open(os.path.join(self.get_previous_job_directory(), 'kmers.json'), 'r') as json_file:
        self.kmers = json.load(json_file)
    print(len(self.kmers))
    gc_coverage_job = depth.ChromosomeGcContentEstimationJob()
    with open(os.path.join(gc_coverage_job.get_current_job_directory(), 'coverage.json'), 'r') as json_file:
        self.coverage = json.load(json_file)
    print('Adjusting GC coverage for', green(len(self.kmers)), 'kmers')
    for n, kmer in enumerate(self.kmers, 1):
        self.transform(self.kmers[kmer], kmer)
        if n % 1000 == 0:
            print(n, 'out of', len(self.kmers))
    # group the corrected kmers by track
    self.tracks = {}
    for kmer in self.kmers:
        for track in self.kmers[kmer]['tracks']:
            self.tracks.setdefault(track, {})[kmer] = self.kmers[kmer]
    current = self.get_current_job_directory()
    for track in self.tracks:
        print('exporting track', track)
        with open(os.path.join(current, 'indicator_kmers_' + track + '.json'), 'w') as json_file:
            json.dump(self.tracks[track], json_file, indent=4, sort_keys=True)
    with open(os.path.join(current, 'batch_merge.json'), 'w') as json_file:
        json.dump({track: 'indicator_kmers_' + track + '.json' for track in self.tracks}, json_file, indent=4)
    exit()
def extract_boundary_gapped_kmers(self, track):
    """Extract the three gapped kmers spanning the event's boundary junctions.

    Returns a dict mapping each gapped kmer (two half-mers around a short
    gap) to its flanking k-length sequences and whether it spans an inner or
    outer junction.
    """
    c = config.Configuration()
    sequence = track.extract_base_sequence()
    begin = track.slack
    end = len(sequence) - track.slack
    gapped_kmers = {}
    # inner junction at the left breakpoint
    kmer = track.sequence[begin - c.hsize - 2:begin + 3 + c.hsize]
    prefix = track.sequence[begin - c.hsize - 2 - c.ksize:begin - c.hsize - 2]
    suffix = track.sequence[begin + 3 + c.hsize:begin + 3 + c.hsize + c.ksize]
    gapped_kmers[kmer] = {'left': prefix, 'right': suffix, 'side': 'inner'}
    # inner junction at the right breakpoint
    kmer = track.sequence[end - c.hsize - 2:end + 3 + c.hsize]
    # BUG FIX: this slice previously used c.size instead of c.ksize
    prefix = track.sequence[end - c.hsize - 2 - c.ksize:end - c.hsize - 2]
    suffix = track.sequence[end + 3 + c.hsize:end + 3 + c.hsize + c.ksize]
    gapped_kmers[kmer] = {'left': prefix, 'right': suffix, 'side': 'inner'}
    # outer junction created by joining the two breakpoints
    kmer = track.sequence[begin - 2 - c.hsize:begin + 3] + track.sequence[end - 2:end + 3 + c.hsize]
    prefix = track.sequence[begin - c.hsize - 2 - c.ksize:begin - c.hsize - 2]
    suffix = track.sequence[end + 3 + c.hsize:end + 3 + c.hsize + c.ksize]
    gapped_kmers[kmer] = {'left': prefix, 'right': suffix, 'side': 'outer'}
    return gapped_kmers
def extract_chromosome(chromosome):
    """Return the sequence of `chromosome` from the reference FASTA.

    Serves from the module-level `chroms` cache when possible; otherwise
    scans the reference file header by header. Returns None when the whole
    genome was already extracted or the chromosome is not in the reference.
    """
    chromosome = chromosome.lower()
    if chromosome in chroms:
        print(yellow('loading from cache'))
        return chroms[chromosome]
    print(red('chromosome not found'), chromosome)
    if whole_genome_extracted:
        return None
    c = config.Configuration()
    sequence = ''
    print(yellow(c.reference))
    # scan for the '>chr...' header, then accumulate lines until the next
    # header or EOF; `with` guarantees the file is closed on every path
    # (the original leaked the handle)
    with open(c.reference) as ref:
        line = ref.readline().lower().strip()
        while True:
            if line.startswith('>chr'):
                chrom = line[line.find('>') + 1:]
                if chrom == chromosome:
                    print('extracting ' + chrom)
                    while True:
                        line = ref.readline().lower().strip()
                        if line.startswith('>') or len(line) == 0:
                            print(line)
                            chroms[chromosome] = sequence
                            return sequence
                        sequence += line.upper()
            line = ref.readline().lower().strip()
            if len(line) == 0:
                break
def generate_mps_linear_program(self):
    """Build a PuLP model minimizing kmer-count errors across all samples.

    Creates one coverage variable in [0, 1] per (sample, track) pair, named
    'lp<sample>c<sanitized_track>', laid out at [p * len(tracks) + track_index].
    The objective sums each kmer's weight over its per-sample absolute-error
    variables, plus (1/n)-weighted coverage-difference terms.
    Returns (problem, variables).

    NOTE(review): the objective indexes `variables` past the initial
    len(paths) * len(tracks) entries — presumably the add_mps_* helpers
    extend the list with error and absolute-value variables; confirm.
    """
    c = config.Configuration()
    problem = LpProblem("Nebula", LpMinimize)
    variables = [None] * len(self.paths) * len(self.tracks)
    n = len(self.paths)
    regex = re.compile('[^a-zA-Z0-9]')
    for p, path in enumerate(self.paths):
        for track in self.tracks:
            # sanitize track names so they are valid MPS identifiers
            name = regex.sub('_', track)
            prefix = 'lp' + str(p) + 'c' + name
            variables[p * len(self.tracks) + self.tracks[track]['index']] = LpVariable(
                prefix, 0, 1)
    self.add_mps_error_absolute_value_constraints(problem, variables)
    self.add_mps_coverage_diff_absolute_value_constraints(
        problem, variables)
    # kmer-weight terms over the absolute error vars, plus the trailing
    # coverage-difference vars weighted 1/n
    expr = LpAffineExpression(
        [(variables[n * (len(self.tracks) + len(self.lp_kmers)) + i],
          self.lp_kmers[i % len(self.lp_kmers)]['weight'])
         for i in range(0, n * len(self.lp_kmers))] +
        [(variables[j], 1.0 / n)
         for j in range(n * (len(self.tracks) + 2 * len(self.lp_kmers)),
                        len(variables))])
    problem += expr
    self.add_mps_optimization_constraints(problem, variables)
    return problem, variables
def extract_chromosomes(chromosomes):
    """Yield (sequence, name) for each requested chromosome in the reference.

    Scans the reference FASTA once, yielding chromosomes in file order and
    returning early once all requested ones have been produced.
    """
    c = config.Configuration()
    m = 0
    # `with` closes the handle when the generator finishes or is collected
    # (the original leaked it)
    with open(c.reference) as ref:
        line = ref.readline().lower().strip()
        found = False
        sequence = ''
        while True:
            if line.startswith('>chr'):
                chrom = line[line.find('>') + 1:].strip().lower()
                if chrom in chromosomes:
                    print('extracting ' + chrom)
                    while True:
                        line = ref.readline().lower().strip()
                        if line.startswith('>') or len(line) == 0:
                            print(len(sequence), 'bases')
                            yield sequence, chrom
                            sequence = ''
                            found = True
                            m += 1
                            if m == len(chromosomes):
                                return
                            break
                        sequence += line.upper()
            # this is to avoid skipping the last line we read for the
            # previous chromosome (it is the header of the next one)
            if found:
                found = False
                continue
            line = ref.readline().lower().strip()
            if len(line) == 0:
                break
def load_inputs(self):
    """Run unified genotyping in sequential batches of events.

    Executes one UnifiedGenotypingJob per batch of `batch_size` events,
    starting at event 1000, then terminates the process.
    """
    c = config.Configuration()
    batch_size = 10
    tracks = {}
    banner = '============================================================================'
    # BUG FIX: range() needs an int; 400 / batch_size is a float on Python 3
    for i in range(0, 400 // batch_size):
        print(yellow(banner))
        print(yellow(banner))
        print(yellow('=================================' + str(i) + '======================================='))
        print(yellow(banner))
        print(yellow(banner))
        job = UnifiedGenotypingJob(begin=1000 + i * batch_size,
                                   end=1000 + (i + 1) * batch_size,
                                   genotyping_batch=i)
        job.execute()
    exit()
def extract_kmers_with_contigs(self):
    """Collect assembly and mapping kmers for every track with a contig.

    Rebuilds self.tracks as {track_id: kmers} for tracks whose contig
    appears in the contigs alignment and yields at least one kmer.
    """
    c = config.Configuration()
    contig_index = {}
    for track in self.tracks:
        contig_index[self.tracks[track].contig] = self.tracks[track]
    n = 0
    self.tracks = {}
    print(len(contig_index), 'total tracks')
    for read in self.contigs.fetch():
        if read.query_name in contig_index:
            t = contig_index[read.query_name]
            kmers = {}
            kmers.update(self.extract_assembly_kmers(t, read.query_sequence))
            kmers.update(self.extract_mapping_kmers(t))
            if len(kmers) > 0:
                self.tracks[t.id] = kmers
            n += 1
            if n % 1000 == 0:
                print(n)
def simulate():
    """Run the simulation pipeline; requires --seed to be set."""
    c = config.Configuration()
    if c.seed == 0:
        # abort instead of running an unseeded (non-reproducible) simulation;
        # the original printed the error but fell through and ran anyway
        print(red('Argument error. Must provide --seed'))
        return
    load_tracks()
    job = Simulation()
    job.execute()
def get_current_job_directory(self):
    """Return this job's output directory, honoring simulation mode."""
    c = config.Configuration()
    if not c.simulation:
        return os.path.abspath(os.path.join(self.get_output_directory(), self._name))
    # simulation runs use the base class layout and log the path
    directory = Job.get_current_job_directory(self)
    print(yellow(directory))
    return directory
def reduce(self):
    """Run the genotyping stages over the merged worker output.

    Indexes kmers and tracks, computes residual coverage, folds in merged
    counts, solves the program, and gathers genotype statistics — in that
    order, since each stage reads state produced by the previous one.
    """
    c = config.Configuration()
    self.index_kmers()
    self.index_tracks()
    self.calculate_residual_coverage()
    self.update_counts()
    self.solve()
    self.gather_genotype_statistics()
def debug_terminate():
    """Kill the process in debug mode so intermediate state can be inspected."""
    c = config.Configuration()
    if not c.debug:
        return
    print(
        magenta(
            'DEBUG TERMINATE *********************************************************************************************'
        ))
    exit()
def __init__(self, **kwargs):
    """Initialize job state and apply any keyword overrides as attributes."""
    c = config.Configuration()
    self.index = -1
    self.batch = {}
    self.children = {}
    # honor --reduce: skip straight to the reduce stage
    self.resume_from_reduce = c.reduce
    for key, value in kwargs.items():
        setattr(self, key, value)
def cgc_genotype():
    """Genotype a sample: count kmers, feed the resulting statistics into the
    configuration, then solve the integer program."""
    c = config.Configuration()
    load_tracks()
    job = CgcCounterJob(resume_from_reduce=c.reduce)
    stats = job.execute()
    # counting statistics feed into the LP stage via the shared configuration
    config.Configuration.update(stats)
    job = CgcIntegerProgrammingJob()
    job.execute()
def merge_counts(self):
    """Accumulate per-batch kmer counts (and loci) into self.kmers."""
    c = config.Configuration()
    print('merging kmer counts...')
    for batch in self.load_output():
        for kmer, data in batch.items():
            self.kmers[kmer]['count'] += data['count']
            if 'loci' in data:
                self.kmers[kmer]['loci'].update(data['loci'])
def generate_linear_program(self):
    """Build the CPLEX model: per-track coverage variables, per-kmer error
    variables, and one linear constraint per kmer tying counts to coverages.

    Returns the populated cplex.Cplex problem.
    """
    print('generating linear program')
    c = config.Configuration()
    # imported lazily so the module loads on machines without CPLEX
    globals()['cplex'] = __import__('cplex')
    problem = cplex.Cplex()
    problem.objective.set_sense(problem.objective.sense.minimize)
    # the coverage of each event, bounded to [0, 1]
    names = [''] * len(self.tracks)
    for track in self.tracks:
        tokens = track.split('_')
        names[self.tracks[track]['index']] = 'c' + tokens[1]
    problem.variables.add(names=names, ub=[1.0] * len(self.tracks))
    # the real-valued error parameter for each kmer
    problem.variables.add(
        names=['e' + str(index) for index, kmer in enumerate(self.lp_kmers)],
        lb=[(kmer['count'] - kmer['coverage'] * kmer['residue'] -
             kmer['coverage'] * sum(kmer['tracks'][track] for track in kmer['tracks']))
            for kmer in self.lp_kmers],
    )
    # absolute value of the error parameter; only these enter the objective
    problem.variables.add(
        names=['l' + str(index) for index, kmer in enumerate(self.lp_kmers)],
        obj=[1.0] * len(self.lp_kmers),
    )
    # one equality constraint per kmer:
    # sum(coverage * track multiplicity) + error == count
    # (the original also computed a progress/ETA value here that was never
    # printed or used; that dead code has been removed)
    for index, kmer in enumerate(self.lp_kmers):
        ind = [self.tracks[track]['index'] for track in kmer['tracks']]
        ind.append(len(self.tracks) + index)
        val = [kmer['coverage'] * kmer['tracks'][track] for track in kmer['tracks']]
        val.append(1.0)
        problem.linear_constraints.add(
            lin_expr=[cplex.SparsePair(ind=ind, val=val)],
            rhs=[kmer['count']],
            senses=['E'])
        self.add_error_absolute_value_constraints(problem, index)
    return problem
def transform(self, track, track_name):
    """Extract a track's gapped kmers, dump them to JSON, return the file name."""
    print(green(track_name))
    c = config.Configuration()
    gapped_kmers = self.extract_gapped_kmers(track)
    output_name = track_name + '.json'
    with open(os.path.join(self.get_current_job_directory(), output_name), 'w') as json_file:
        json.dump(gapped_kmers, json_file, sort_keys=True, indent=4)
    return output_name
def load_inputs(self):
    """Prepare GC-content bins and distribute chromosomes among workers."""
    c = config.Configuration()
    self.window_size = 96
    # one bucket per possible GC base count in a window (0..window_size)
    self.gc = {i: {} for i in range(self.window_size + 1)}
    self.chromosomes = extract_whole_genome()
    self.load_reference_counts_provider()
    self.round_robin(self.chromosomes)
def calculate_residual_coverage(self):
    """Cap each kmer's count and compute its residual reference multiplicity.

    The residue is the number of reference loci not explained by any of the
    kmer's tracks; the count is capped so no single kmer can dominate the LP
    score. (An unused `config.Configuration()` local was removed.)
    """
    for kmer in self.lp_kmers:
        r = sum(kmer['tracks'].values())
        # put an upperbound on a kmer's impact on LP score
        kmer['count'] = min(kmer['count'], kmer['coverage'] * kmer['reference'])
        kmer['residue'] = kmer['reference'] - r
def estimate_likelihood(self, track, distribution):
    """Return the average log-likelihood of a track's LP kmers.

    Each kmer's residue-adjusted count is scored under the distribution
    matching its type; the per-kmer likelihood is cached on the kmer dict.
    Returns 0.0 for a track with no LP kmers (the original raised NameError
    there because the loop index was never bound). An unused
    `config.Configuration()` local was removed.
    """
    lp_kmers = self.tracks[track]['lp_kmers']
    if not lp_kmers:
        return 0.0
    likelihood = 0.0
    for kmer in lp_kmers:
        l = distribution[kmer['type']].log_pmf(kmer['count'] - kmer['residue'] * kmer['coverage'])
        kmer['likelihood'] = l
        likelihood += l
    # dividing by len(lp_kmers) matches the original's (last index + 1)
    return likelihood / len(lp_kmers)
def extract_kmers(k=32, canonical=True, *args):
    """Count k-mers of length `k` across the given sequence strings.

    Args:
        k: kmer length.
        canonical: when True, collapse each kmer with its reverse complement
            via canonicalize().
        *args: one or more sequence strings.

    Returns:
        dict mapping kmer -> occurrence count. Sequences shorter than `k`
        contribute nothing. (An unused `config.Configuration()` local was
        removed.)
    """
    kmers = {}
    for s in args:
        for i in range(len(s) - k + 1):
            kmer = canonicalize(s[i:i + k]) if canonical else s[i:i + k]
            kmers[kmer] = kmers.get(kmer, 0) + 1
    return kmers