예제 #1
0
 def run_batch(self, batch):
     """Transform every entry of ``batch`` in place, write the batch, and exit.

     Each value is replaced by ``self.transform(value, key)``. Keys whose
     transform returns ``None`` or raises are removed before the batch is
     passed to ``self.output_batch``. Progress and an ETA (in hours) are
     printed after every entry.

     This method does not return normally: it finishes by calling
     ``self.on_exit_worker()`` and then ``exit()``.
     """
     config.Configuration()  # result unused here; the call itself is kept
     failed = set()
     total = len(batch)
     start = time.time()
     # Only existing keys are reassigned inside the loop, so iterating the
     # mapping directly is safe; removals are deferred until afterwards.
     for n, track in enumerate(batch, 1):
         try:
             batch[track] = self.transform(batch[track], track)
             if batch[track] is None:
                 failed.add(track)
         except Exception as e:
             # Best effort: report the failure and drop this track instead
             # of aborting the whole batch.
             print(red(e))
             traceback.print_exc()
             failed.add(track)
         elapsed = time.time() - start
         p = float(n) / total
         eta = (1.0 - p) * ((1.0 / p) * elapsed) / 3600
         print('{:2d}'.format(self.index), 'progress:', '{:7.5f}'.format(p), 'ETA:', '{:8.6f}'.format(eta))
         if n % 1000 == 0:
             gc.collect()
     for track in failed:
         batch.pop(track, None)
     # NOTE: the batch is written even when every track has been removed.
     self.output_batch(batch)
     self.on_exit_worker()
     exit()
예제 #2
0
 def transform(self, track, track_name):
     """Merge the kmers stored for one track into ``self.kmers``.

     Args:
         track: name of the JSON file, relative to the previous job's
             directory, holding this track's kmers.
         track_name: key of the track in the configuration's ``tracks``
             mapping; its ``chrom``/``begin``/``end`` select which loci
             contribute to a kmer's ``interest_masks``.

     Returns:
         Always ``None``.
     """
     c = config.Configuration()
     with open(os.path.join(self.get_previous_job_directory(), track), 'r') as json_file:
         kmers = json.load(json_file)
     t = c.tracks[track_name]
     for kmer, info in kmers.items():
         if 'N' in kmer:
             continue
         if kmer not in self.kmers:
             # First sighting of this kmer: copy its description and give
             # every locus a mask set made of its left and right sequences.
             entry = {}
             self.kmers[kmer] = entry
             entry['loci'] = info['loci']
             entry['total'] = 0
             entry['count'] = 0
             entry['doubt'] = 0
             entry['tracks'] = info['tracks']
             entry['reference'] = info['reference']
             entry['interest_masks'] = {}
             for locus_info in entry['loci'].values():
                 seq = locus_info['seq']
                 locus_info['masks'] = {seq['left']: True, seq['right']: True}
         entry = self.kmers[kmer]
         for locus, locus_info in entry['loci'].items():
             tokens = locus.split('_')
             # A locus is of interest when its name contains 'inside' or
             # when it is named <chrom>_<position>_... and the position
             # falls in this track's [begin, end) interval.
             if 'inside' in locus or (tokens[0].lower() == t.chrom.lower() and t.begin <= int(tokens[1]) < t.end):
                 entry['interest_masks'].update(locus_info['masks'])
     return None
예제 #3
0
 def import_lp_values(self, path='solution.mps'):
     """Load LP variable values from a solution file into ``self.solution``.

     ``self.solution`` is rebuilt as one row per entry of ``self.paths``,
     each row holding one float per track (initially 0.0). Variables named
     ``lp<sample>c<track>`` are stored at ``[sample][track index]``; all
     other variables in the file are ignored.

     Args:
         path: name of the solution file, resolved against the current job
             directory. Defaults to ``'solution.mps'``.
     """
     config.Configuration()  # result unused here; the call itself is kept
     solution_path = os.path.join(self.get_current_job_directory(), path)
     track_count = len(self.tracks)
     self.solution = [[0.0] * track_count for _ in self.paths]
     # Map every LP variable name to its track's column index. Track names
     # are sanitised by replacing non-alphanumeric characters with '_'.
     sanitize = re.compile('[^a-zA-Z0-9]')
     var_index = {}
     for p in range(len(self.paths)):
         for track in self.tracks:
             var_index['lp' + str(p) + 'c' + sanitize.sub('_', track)] = self.tracks[track]['index']
     with open(solution_path, 'r') as f:
         # The first two lines (status and objective) carry no variables.
         f.readline()
         f.readline()
         for line in f:
             tokens = line.split()
             if not tokens:
                 continue  # tolerate blank lines
             name = tokens[1]
             if name.startswith('lp'):
                 # The sample number sits between the 'lp' prefix and the
                 # first 'c' of the variable name.
                 sample = int(name[2:name.find('c')])
                 self.solution[sample][var_index[name]] = float(tokens[2])
예제 #4
0
 def load_inputs(self):
     """Load gapped kmers from the previous job and index them by half-mers.

     Kmers whose ``gap`` is -1 are skipped. Every kept kmer is stored in
     ``self.kmers`` with zeroed ``count``/``doubt`` and registered in
     ``self.half_mers`` under its first and last ``hsize`` bases, both as
     read and reverse-complemented. Afterwards the accelerator input is
     exported and ``self.round_robin()`` is called.
     """
     cfg = config.Configuration()
     self.kmers = {}
     tracks = self.load_previous_job_results()
     self.half_mers = {}
     for track in tracks:
         print(cyan(track))
         track_path = os.path.join(self.get_previous_job_directory(), tracks[track])
         with open(track_path, 'r') as json_file:
             loaded = json.load(json_file)
             for kmer, record in loaded.items():
                 if record['gap'] == -1:
                     continue
                 head = kmer[:cfg.hsize]
                 tail = kmer[-cfg.hsize:]
                 self.kmers[kmer] = record
                 record['count'] = 0
                 record['doubt'] = 0
                 # forward orientation: head -> tail
                 self.half_mers.setdefault(head, {})[tail] = kmer
                 # reverse-complement orientation: rc(tail) -> rc(head)
                 rc_head = reverse_complement(head)
                 rc_tail = reverse_complement(tail)
                 self.half_mers.setdefault(rc_tail, {})[rc_head] = kmer
     print(len(self.kmers), 'kmers')
     self.export_accelerator_input()
     self.round_robin()
예제 #5
0
 def reduce(self):
     """Merge counts, then dump all kmers and one JSON file per track.

     Writes ``kmers.json``, ``<track>.json`` for every track referenced by
     a kmer, and ``batch_merge.json`` mapping each track to its file name,
     all inside the current job directory.
     """
     config.Configuration()  # result unused here; the call itself is kept
     self.merge_counts()
     kmers_path = os.path.join(self.get_current_job_directory(), 'kmers.json')
     with open(kmers_path, 'w') as json_file:
         json.dump(self.kmers, json_file, indent=4, sort_keys=True)
     # group kmers by the tracks that list them
     self.tracks = {}
     for kmer, record in self.kmers.items():
         for track in record['tracks']:
             self.tracks.setdefault(track, {})[kmer] = record
     for track, track_kmers in self.tracks.items():
         track_path = os.path.join(self.get_current_job_directory(), track + '.json')
         with open(track_path, 'w') as json_file:
             json.dump(track_kmers, json_file, indent=4, sort_keys=True)
     index_path = os.path.join(self.get_current_job_directory(), 'batch_merge.json')
     with open(index_path, 'w') as json_file:
         file_names = {track: track + '.json' for track in self.tracks}
         json.dump(file_names, json_file, indent=4)
예제 #6
0
 def export_solution(self):
     """Attach the LP solution to the tracks and write the genotype BED file.

     The file is ``merge.bed`` in the current job directory, or, when the
     configuration's ``cgc`` flag is set, ``genotypes_<input name>.bed`` in
     the working directory. ``self.export_kmers()`` is called at the end.
     """
     cfg = config.Configuration()
     # entries of the solution past the per-track values are kept as errors
     self.errors = self.solution[len(self.tracks):]
     for info in self.tracks.values():
         info['coverage'] = self.solution[info['index']]
         info['lp_kmers'] = []
     for kmer in self.lp_kmers:
         for track in kmer['tracks']:
             self.tracks[track]['lp_kmers'].append(kmer)
     self.find_rounding_break_points()
     print('Rounding', len(self.tracks), 'tracks')
     if cfg.cgc:
         sample = cfg.fastq if cfg.fastq else cfg.bam
         name = 'genotypes_' + sample.split('/')[-1] + '.bed'
         directory = os.getcwd()
     else:
         name = 'merge.bed'
         directory = self.get_current_job_directory()
     path = os.path.join(directory, name)
     with open(path, 'w') as bed_file:
         bed_file.write('CHROM\tBEGIN\tEND\tLP_GENOTYPE\tLP_VALUE\tID\n')
         for track in self.tracks:
             t = cfg.tracks[track]
             lp_value = self.solution[self.tracks[track]['index']]
             g = self.round_genotype(lp_value, t.svtype)
             bed_file.write(t.chrom + '\t' + '\t'.join(
                 [str(t.begin), str(t.end), str(g[1]), str(lp_value), str(t.id)]) + '\n')
     self.export_kmers()
예제 #7
0
 def transform(self):
     """Run the external ``counter.out`` binary on the configured input, then exit.

     With a BAM input the counter runs once. Otherwise it runs once per
     FASTQ file, and after each run ``c_batch_<index>.json`` is renamed to
     ``c_batch_<index>.<i>.json`` so later runs do not overwrite it.

     Commands are passed as argument lists rather than shell strings, so
     paths containing spaces or shell metacharacters reach the binary
     unchanged. Exit statuses are ignored. This method does not return
     normally: it finishes by calling ``exit()``.
     """
     c = config.Configuration()
     counter = os.path.join(os.path.dirname(__file__), '../../cpp', 'counter.out')
     job_dir = self.get_current_job_directory()

     def run_counter(input_path):
         # argv: index, job directory, input, threads, counter mode,
         # debug flag, simulation flag
         return subprocess.call([
             counter,
             str(self.index),
             job_dir,
             input_path,
             str(self.num_threads),
             str(self._counter_mode),
             "1" if c.debug else "0",
             "1" if c.simulation else "0",
         ])

     if c.bam:
         run_counter(c.bam)
     else:
         batch_name = 'c_batch_' + str(self.index)
         for i, fastq_file in enumerate(c.fastq):
             run_counter(fastq_file)
             # `mv` is still used (without a shell) so a missing output
             # file yields a non-zero status rather than an exception.
             subprocess.call([
                 "mv",
                 os.path.join(job_dir, batch_name + '.json'),
                 os.path.join(job_dir, batch_name + '.' + str(i) + '.json'),
             ])
     exit()
예제 #8
0
 def reduce(self):
     """Merge counts, trim outliers twice, and export the mean and std.

     Statistics are printed for the full set of counts, again after
     dropping counts not below 3x the mean, and again after dropping counts
     not below 2x the new mean. A random sample of at most 10000 of the
     remaining counts is plotted, and the final mean/std are written to
     ``stats_<ksize>.json`` in the current job directory.
     """
     c = config.Configuration()
     self.kmers = {}
     self.merge_counts()
     self.counts = list(self.kmers.values())
     for cutoff in (None, 3, 2):
         if cutoff is not None:
             # filter outliers relative to the mean of the previous pass
             threshold = cutoff * self.mean
             self.counts = [x for x in self.counts if x < threshold]
         self.mean = numpy.mean(self.counts)
         self.std = numpy.std(self.counts)
         print(len(self.counts))
         print('mean:', self.mean)
         print('std:', self.std)
     # random.sample raises ValueError when asked for more items than exist,
     # so cap the sample size at the number of counts left.
     sample_size = min(10000, len(self.counts))
     sample_indices = sorted(random.sample(range(len(self.counts)), sample_size))
     self.plot_reference_distribution([self.counts[i] for i in sample_indices])
     with open(os.path.join(self.get_current_job_directory(), 'stats_' + str(c.ksize) + '.json'), 'w') as json_file:
         json.dump({'mean': self.mean, 'std': self.std}, json_file, sort_keys=True, indent=4)
예제 #9
0
 def load_inputs(self):
     """Extract the whole genome, load the tracks and distribute them.

     ``self.round_robin`` receives a ``filter_func`` that is true for
     tracks spanning more than 1,000,000 bases; how the predicate is
     applied is decided by ``round_robin``.
     """
     config.Configuration()  # result unused here; the call itself is kept
     extract_whole_genome()
     self.tracks = self.load_tracks()

     def spans_over_one_megabase(track):
         return track.end - track.begin > 1000000

     self.round_robin(self.tracks, filter_func=spans_over_one_megabase)
예제 #10
0
 def export_solution(self):
     """Write one genotype BED file per sample path, plus a copy per sample.

     For sample ``p`` the rounded genotypes go to ``merge_<p>.bed`` in the
     current job directory and, with identical content, to
     ``<path>/CgcIntegerProgrammingJob/union.bed``.
     """
     cfg = config.Configuration()
     for info in self.tracks.values():
         info['coverage'] = []
         info['lp_kmers'] = []
     for kmer in self.lp_kmers:
         for track in kmer['tracks']:
             self.tracks[track]['lp_kmers'].append(kmer)
     header = 'CHROM\tBEGIN\tEND\tLP_GENOTYPE\tLP_VALUE\tID\n'
     for p, path in enumerate(self.paths):
         print('Genotyping sample', path)
         self.find_rounding_break_points()
         bed_path = os.path.join(self.get_current_job_directory(), 'merge_' + str(p) + '.bed')
         with open(bed_path, 'w') as bed_file, \
                 open(os.path.join(path, 'CgcIntegerProgrammingJob', 'union.bed'), 'w') as cluster_file:
             bed_file.write(header)
             cluster_file.write(header)
             for track in self.tracks:
                 t = cfg.tracks[track]
                 lp_value = self.solution[p][self.tracks[track]['index']]
                 g = self.round_genotype(lp_value, t.svtype)
                 line = t.chrom + '\t' + '\t'.join(
                     [str(t.begin), str(t.end), str(g[1]), str(lp_value), str(t.id)]) + '\n'
                 bed_file.write(line)
                 cluster_file.write(line)
예제 #11
0
 def load_inputs(self):
     """Load kmers and GC coverage, transform every kmer, export per track, exit.

     Reads ``kmers.json`` from the previous job and ``coverage.json`` from
     the GC-content estimation job's directory, calls ``self.transform`` on
     each kmer, then writes ``indicator_kmers_<track>.json`` per track and
     a ``batch_merge.json`` index. This method does not return normally:
     it finishes by calling ``exit()``.
     """
     config.Configuration()  # result unused here; the call itself is kept
     kmers_path = os.path.join(self.get_previous_job_directory(), 'kmers.json')
     with open(kmers_path, 'r') as json_file:
         self.kmers = json.load(json_file)
     print(len(self.kmers))
     gc_coverage_job = depth.ChromosomeGcContentEstimationJob()
     coverage_path = os.path.join(gc_coverage_job.get_current_job_directory(), 'coverage.json')
     with open(coverage_path, 'r') as json_file:
         self.coverage = json.load(json_file)
     print('Adjusting GC coverage for', green(len(self.kmers)), 'kmers')
     for n, kmer in enumerate(self.kmers, 1):
         self.transform(self.kmers[kmer], kmer)
         if n % 1000 == 0:
             print(n, 'out of', len(self.kmers))
     # group kmers by the tracks that list them
     self.tracks = {}
     for kmer, record in self.kmers.items():
         for track in record['tracks']:
             self.tracks.setdefault(track, {})[kmer] = record
     for track, track_kmers in self.tracks.items():
         print('exporting track', track)
         track_path = os.path.join(self.get_current_job_directory(), 'indicator_kmers_' + track + '.json')
         with open(track_path, 'w') as json_file:
             json.dump(track_kmers, json_file, indent=4, sort_keys=True)
     index_path = os.path.join(self.get_current_job_directory(), 'batch_merge.json')
     with open(index_path, 'w') as json_file:
         file_names = {track: 'indicator_kmers_' + track + '.json' for track in self.tracks}
         json.dump(file_names, json_file, indent=4)
     exit()
예제 #12
0
 def extract_boundary_gapped_kmers(self, track):
     """Build the three boundary kmers of ``track`` with their flanks.

     Two 'inner' kmers are centred on the two breakpoints (``slack`` bases
     in from either end of the sequence), and one 'outer' kmer joins the
     left side of the first breakpoint to the right side of the second.

     Returns:
         dict mapping each kmer to ``{'left': ..., 'right': ..., 'side': ...}``
         where ``left``/``right`` are the ``ksize`` bases flanking the kmer.
         Identical kmers overwrite each other.
     """
     c = config.Configuration()
     sequence = track.extract_base_sequence()
     begin = track.slack
     end = len(sequence) - track.slack
     seq = track.sequence
     hsize = c.hsize
     ksize = c.ksize

     def left_flank(pos):
         # ksize bases ending where the window around `pos` starts
         return seq[pos - hsize - 2 - ksize:pos - hsize - 2]

     def right_flank(pos):
         # ksize bases starting where the window around `pos` ends
         return seq[pos + 3 + hsize:pos + 3 + hsize + ksize]

     gapped_kmers = {}
     # window around the first breakpoint
     kmer = seq[begin - hsize - 2:begin + 3 + hsize]
     gapped_kmers[kmer] = {'left': left_flank(begin), 'right': right_flank(begin), 'side': 'inner'}
     # window around the second breakpoint (the left flank previously used
     # `c.size`; every other flank uses `c.ksize`)
     kmer = seq[end - hsize - 2:end + 3 + hsize]
     gapped_kmers[kmer] = {'left': left_flank(end), 'right': right_flank(end), 'side': 'inner'}
     # left part of the first window joined to the right part of the second
     kmer = seq[begin - 2 - hsize:begin + 3] + seq[end - 2:end + 3 + hsize]
     gapped_kmers[kmer] = {'left': left_flank(begin), 'right': right_flank(end), 'side': 'outer'}
     return gapped_kmers
예제 #13
0
def extract_chromosome(chromosome):
    """Return the upper-cased sequence of one chromosome, using the cache first.

    The name is matched case-insensitively against ``>chr...`` header lines
    of the configured reference file. A sequence read from the file is
    stored in the module-level ``chroms`` cache before being returned.

    Returns:
        The sequence as a string, or ``None`` when the chromosome is not
        cached and either the whole genome has already been extracted or
        no matching header is found.
    """
    chromosome = chromosome.lower()
    if chromosome in chroms:
        print(yellow('loading from cache'))
        return chroms[chromosome]
    print(red('chromosome not found'), chromosome)
    if whole_genome_extracted:
        return None
    c = config.Configuration()
    print(yellow(c.reference))
    with open(c.reference) as ref:
        line = ref.readline().lower().strip()
        while True:
            if line.startswith('>chr') and line[1:] == chromosome:
                print('extracting ' + chromosome)
                # Collect the lines and join once, avoiding repeated string
                # concatenation on chromosome-sized data.
                pieces = []
                while True:
                    line = ref.readline().lower().strip()
                    if line.startswith('>') or len(line) == 0:
                        print(line)
                        sequence = ''.join(pieces)
                        chroms[chromosome] = sequence
                        return sequence
                    pieces.append(line.upper())
            line = ref.readline().lower().strip()
            # An empty line (after stripping) is treated as end of input.
            if len(line) == 0:
                break
    return None
예제 #14
0
 def generate_mps_linear_program(self):
     """Assemble the multi-sample "Nebula" minimisation problem.

     One ``lp<sample>c<track>`` variable bounded to [0, 1] is created per
     sample and track. The helper methods called afterwards receive the
     same ``variables`` list; the objective indexes past the per-track
     variables, so they are presumably expected to append to it
     (NOTE(review): confirm against the helpers).

     Returns:
         Tuple ``(problem, variables)``.
     """
     config.Configuration()  # result unused here; the call itself is kept
     problem = LpProblem("Nebula", LpMinimize)
     sample_count = len(self.paths)
     track_count = len(self.tracks)
     variables = [None] * sample_count * track_count
     sanitize = re.compile('[^a-zA-Z0-9]')
     for p in range(sample_count):
         offset = p * track_count
         for track in self.tracks:
             var_name = 'lp' + str(p) + 'c' + sanitize.sub('_', track)
             variables[offset + self.tracks[track]['index']] = LpVariable(var_name, 0, 1)
     self.add_mps_error_absolute_value_constraints(problem, variables)
     self.add_mps_coverage_diff_absolute_value_constraints(problem, variables)
     n_tracks = len(self.tracks)
     n_kmers = len(self.lp_kmers)
     # weighted terms, one per sample and kmer, starting right after the
     # first sample_count * (tracks + kmers) variables
     weighted_start = sample_count * (n_tracks + n_kmers)
     weighted_terms = [
         (variables[weighted_start + i], self.lp_kmers[i % n_kmers]['weight'])
         for i in range(0, sample_count * n_kmers)
     ]
     # every remaining variable from this offset on gets weight 1/samples
     tail_start = sample_count * (n_tracks + 2 * n_kmers)
     tail_terms = [
         (variables[j], 1.0 / sample_count)
         for j in range(tail_start, len(variables))
     ]
     problem += LpAffineExpression(weighted_terms + tail_terms)
     self.add_mps_optimization_constraints(problem, variables)
     return problem, variables
예제 #15
0
def extract_chromosomes(chromosomes):
    """Yield ``(sequence, name)`` for each requested chromosome in the reference.

    Header lines starting with ``>chr`` are matched (lower-cased, without
    the ``>``) against ``chromosomes``. Sequences are upper-cased. Iteration
    stops once as many chromosomes as requested have been yielded, or at
    the end of the file.

    Args:
        chromosomes: container of lower-case chromosome names; must
            support ``in`` and ``len``.
    """
    c = config.Configuration()
    extracted = 0
    # The context manager closes the reference when the generator finishes
    # or is closed early.
    with open(c.reference) as ref:
        line = ref.readline().lower().strip()
        while True:
            if line.startswith('>chr'):
                chrom = line[line.find('>') + 1:].strip().lower()
                if chrom in chromosomes:
                    print('extracting ' + chrom)
                    # Collect lines and join once instead of growing a
                    # chromosome-sized string piece by piece.
                    pieces = []
                    while True:
                        line = ref.readline().lower().strip()
                        if line.startswith('>') or len(line) == 0:
                            break
                        pieces.append(line.upper())
                    sequence = ''.join(pieces)
                    print(len(sequence), 'bases')
                    yield sequence, chrom
                    extracted += 1
                    if extracted == len(chromosomes):
                        return
                    # `line` already holds the next header (or '' at end of
                    # file); re-examine it rather than reading past it.
                    continue
            line = ref.readline().lower().strip()
            # An empty line (after stripping) is treated as end of input.
            if len(line) == 0:
                break
예제 #16
0
 def load_inputs(self):
     """Run ``UnifiedGenotypingJob`` over 400 positions in batches of 10, then exit.

     Batch ``i`` covers ``begin = 1000 + i * 10`` to ``end = 1000 + (i + 1) * 10``
     and is announced with a yellow banner. This method does not return
     normally: it finishes by calling ``exit()``.
     """
     config.Configuration()  # result unused here; the call itself is kept
     batch_size = 10
     rule = '============================================================================'
     # Floor division keeps the batch count an int on Python 3, where `/`
     # yields a float that range() rejects.
     for i in range(0, 400 // batch_size):
         print(yellow(rule))
         print(yellow(rule))
         print(
             yellow('=================================' + str(i) +
                    '======================================='))
         print(yellow(rule))
         print(yellow(rule))
         job = UnifiedGenotypingJob(begin=1000 + i * batch_size,
                                    end=1000 + (i + 1) * batch_size,
                                    genotyping_batch=i)
         job.execute()
     exit()
예제 #17
0
 def extract_kmers_with_contigs(self):
     """Replace ``self.tracks`` with the kmers extracted for each track's contig.

     Tracks are looked up by their ``contig`` attribute for every read
     returned by ``self.contigs.fetch()``. For a matching read the assembly
     kmers (from the read's query sequence) and the mapping kmers are
     merged; tracks that yield at least one kmer are stored in
     ``self.tracks`` under the track's ``id``.
     """
     config.Configuration()  # result unused here; the call itself is kept
     contig_index = {}
     for track in self.tracks:
         contig_index[self.tracks[track].contig] = self.tracks[track]
     self.tracks = {}
     print(len(contig_index), 'total tracks')
     n = 0
     for read in self.contigs.fetch():
         if read.query_name not in contig_index:
             continue
         t = contig_index[read.query_name]
         kmers = {}
         kmers.update(self.extract_assembly_kmers(t, read.query_sequence))
         kmers.update(self.extract_mapping_kmers(t))
         if kmers:
             self.tracks[t.id] = kmers
             n += 1
             # Report progress only when the counter has just advanced, so
             # each multiple of 1000 is printed once.
             if n % 1000 == 0:
                 print(n)
예제 #18
0
def simulate():
    """Load the tracks and run a ``Simulation`` job.

    A seed of 0 is reported as an argument error and the simulation is not
    started.
    """
    c = config.Configuration()
    if c.seed == 0:
        print(red('Argument error. Must provide --seed'))
        # Stop here: continuing would run the simulation despite the
        # reported error.
        return
    load_tracks()
    job = Simulation()
    job.execute()
예제 #19
0
 def get_current_job_directory(self):
     """Return this job's directory.

     Outside simulation mode this is the absolute path of ``self._name``
     under the output directory. In simulation mode the base ``Job``
     implementation is used and its result is also printed.
     """
     cfg = config.Configuration()
     if not cfg.simulation:
         return os.path.abspath(os.path.join(self.get_output_directory(), self._name))
     directory = Job.get_current_job_directory(self)
     print(yellow(directory))
     return directory
예제 #20
0
 def reduce(self):
     """Run the reduce pipeline: index, adjust counts, solve, gather statistics.

     The steps are invoked strictly in the order listed below.
     """
     config.Configuration()  # result unused here; the call itself is kept
     # build the kmer and track indices
     self.index_kmers()
     self.index_tracks()
     # prepare the per-kmer quantities
     self.calculate_residual_coverage()
     self.update_counts()
     # solve, then summarise the genotypes
     self.solve()
     self.gather_genotype_statistics()
예제 #21
0
def debug_terminate():
    """Print a banner and exit the process when the debug flag is set.

    Does nothing when the configuration's ``debug`` flag is false.
    """
    cfg = config.Configuration()
    if not cfg.debug:
        return
    banner = 'DEBUG TERMINATE *********************************************************************************************'
    print(magenta(banner))
    exit()
예제 #22
0
 def __init__(self, **kwargs):
     """Set default job state, then apply ``kwargs`` as instance attributes.

     Keyword arguments are applied last, so they override the defaults set
     here (``index``, ``batch``, ``children``, ``resume_from_reduce``).
     """
     cfg = config.Configuration()
     self.index = -1
     self.batch = {}
     self.children = {}
     self.resume_from_reduce = cfg.reduce
     for attribute, value in kwargs.items():
         setattr(self, attribute, value)
예제 #23
0
def cgc_genotype():
    """Run the CGC counter job, feed its result into the configuration, then genotype.

    The value returned by the counter job's ``execute()`` is passed to
    ``config.Configuration.update`` before the integer-programming job runs.
    """
    cfg = config.Configuration()
    load_tracks()
    counter_job = CgcCounterJob(resume_from_reduce=cfg.reduce)
    config.Configuration.update(counter_job.execute())
    genotyping_job = CgcIntegerProgrammingJob()
    genotyping_job.execute()
예제 #24
0
 def merge_counts(self):
     """Add the counts from every output batch into ``self.kmers``.

     For each kmer of each batch returned by ``self.load_output()`` the
     batch's ``count`` is added to the stored one and, when the batch entry
     has ``loci``, they are merged into the stored ``loci``. Every batch
     kmer must already exist in ``self.kmers``.
     """
     config.Configuration()  # result unused here; the call itself is kept
     print('merging kmer counts...')
     for batch in self.load_output():
         for kmer, counted in batch.items():
             merged = self.kmers[kmer]
             merged['count'] += counted['count']
             if 'loci' in counted:
                 merged['loci'].update(counted['loci'])
예제 #25
0
 def generate_linear_program(self):
     """Build the CPLEX minimisation problem for the current tracks and kmers.

     Variables are added in three groups, in this order: one coverage
     variable per track (upper bound 1.0), one error variable ``e<i>`` per
     LP kmer, and one ``l<i>`` variable per LP kmer with objective
     coefficient 1.0. One equality constraint is added per kmer, followed
     by that kmer's absolute-value constraints.

     Returns:
         The populated ``cplex.Cplex`` problem.
     """
     print('generating linear program')
     config.Configuration()  # result unused here; the call itself is kept
     # cplex is imported on demand and published as a module global.
     globals()['cplex'] = __import__('cplex')
     problem = cplex.Cplex()
     problem.objective.set_sense(problem.objective.sense.minimize)
     track_count = len(self.tracks)
     kmer_count = len(self.lp_kmers)
     # the coverage of each event
     names = [''] * track_count
     for track, info in self.tracks.items():
         names[info['index']] = 'c' + track.split('_')[1]
     problem.variables.add(
         names=names,
         ub=[1.0] * track_count,
     )
     # the real-valued error parameter for inner_kmer
     problem.variables.add(
         names=['e' + str(index) for index in range(kmer_count)],
         lb=[(kmer['count'] - kmer['coverage'] * kmer['residue'] -
              kmer['coverage'] * sum(kmer['tracks'].values()))
             for kmer in self.lp_kmers],
     )
     # absolute value of the inner_kmer error parameter
     problem.variables.add(
         names=['l' + str(index) for index in range(kmer_count)],
         obj=[1.0] * kmer_count,
     )
     # constraints: for every kmer,
     #   sum(coverage * multiplicity * c_track) + e_kmer == count
     for index, kmer in enumerate(self.lp_kmers):
         ind = [self.tracks[track]['index'] for track in kmer['tracks']]
         ind.append(track_count + index)
         val = [kmer['coverage'] * kmer['tracks'][track]
                for track in kmer['tracks']]
         val.append(1.0)
         problem.linear_constraints.add(
             lin_expr=[cplex.SparsePair(ind=ind, val=val)],
             rhs=[kmer['count']],
             senses=['E'])
         self.add_error_absolute_value_constraints(problem, index)
     return problem
예제 #26
0
 def transform(self, track, track_name):
     """Extract the gapped kmers of ``track`` and dump them to ``<track_name>.json``.

     The file is written to the current job directory.

     Returns:
         The file name (not the full path) of the JSON file written.
     """
     print(green(track_name))
     config.Configuration()  # result unused here; the call itself is kept
     gapped_kmers = self.extract_gapped_kmers(track)
     file_name = track_name + '.json'
     with open(os.path.join(self.get_current_job_directory(), file_name), 'w') as json_file:
         json.dump(gapped_kmers, json_file, sort_keys=True, indent=4)
     return file_name
예제 #27
0
 def load_inputs(self):
     """Prepare one empty GC bucket per possible count and distribute the chromosomes.

     ``self.gc`` gets an empty dict for every integer from 0 to
     ``self.window_size`` (96) inclusive. The chromosomes returned by
     ``extract_whole_genome()`` are then handed to ``self.round_robin``.
     """
     config.Configuration()  # result unused here; the call itself is kept
     self.window_size = 96
     self.gc = {i: {} for i in range(0, self.window_size + 1)}
     self.chromosomes = extract_whole_genome()
     self.load_reference_counts_provider()
     self.round_robin(self.chromosomes)
예제 #28
0
 def calculate_residual_coverage(self):
     """Cap each LP kmer's count and record its residue.

     ``count`` is limited to ``coverage * reference``; ``residue`` is
     ``reference`` minus the sum of the kmer's per-track values.
     """
     config.Configuration()  # result unused here; the call itself is kept
     for kmer in self.lp_kmers:
         in_tracks = sum(kmer['tracks'].values())
         # put an upperbound on a kmer's impact on LP score
         ceiling = kmer['coverage'] * kmer['reference']
         kmer['count'] = min(kmer['count'], ceiling)
         kmer['residue'] = kmer['reference'] - in_tracks
예제 #29
0
 def estimate_likelihood(self, track, distribution):
     """Return the mean log-likelihood of a track's LP kmers.

     For each kmer, ``distribution[kmer['type']].log_pmf`` is evaluated at
     ``count - residue * coverage``; the value is also stored on the kmer
     as ``kmer['likelihood']``.

     Args:
         track: key into ``self.tracks``.
         distribution: mapping from kmer type to an object providing
             ``log_pmf``.

     Returns:
         The average of the per-kmer values, or 0.0 when the track has no
         LP kmers.
     """
     config.Configuration()  # result unused here; the call itself is kept
     kmers = self.tracks[track]['lp_kmers']
     if not kmers:
         # Nothing to average; this case used to fail on an unbound loop
         # variable. NOTE(review): 0.0 is a neutral placeholder -- confirm
         # it suits the callers.
         return 0.0
     likelihood = 0.0
     for kmer in kmers:
         l = distribution[kmer['type']].log_pmf(kmer['count'] -
                                                kmer['residue'] *
                                                kmer['coverage'])
         kmer['likelihood'] = l
         likelihood += l
     return likelihood / len(kmers)
예제 #30
0
def extract_kmers(k=32, canonical=True, *args):
    """Count every length-``k`` substring of the given sequences.

    Args:
        k: substring length.
        canonical: when true, each substring is passed through
            ``canonicalize`` before being counted.
        *args: the sequences to scan.

    Returns:
        dict mapping each (optionally canonicalised) kmer to the number of
        times it occurs across all sequences.
    """
    config.Configuration()  # result unused here; the call itself is kept
    counts = {}
    for sequence in args:
        for start in range(len(sequence) - k + 1):
            window = sequence[start:start + k]
            if canonical:
                window = canonicalize(window)
            counts[window] = counts.get(window, 0) + 1
    return counts