def main():
    '''
    Search for the filter width that yields a requested number of minima.

    Run with at least:
    1 argument, defining the number of minima to be found
    optional last argument is file name of data to read into memory (if it hasn't already been read)

    Running with no arguments, or with 'help' as the first argument, prints
    this text and returns.
    '''
    # A missing first argument used to surface as a bare IndexError; show the
    # usage text instead.
    if len(sys.argv) < 2 or sys.argv[1] == 'help':
        print(main.__doc__)
        return

    srch_val = int(sys.argv[1])

    # The data lives in a module-level global so that a later call in the same
    # process can reuse it instead of reading the file again.
    global np_init_array
    try:
        np_init_array
    except NameError:
        flat.print_log_msg('Reading data')
        # NOTE(review): with a single argument, sys.argv[-1] is the minima count
        # itself, so it is what gets passed as the file name -- confirm intended.
        np_init_array, _ = rd.read_data(sys.argv[-1])

    found_width = custom_binary_search_with_trackback(
        np_init_array,
        filt.apply_filter_get_minima,
        srch_val,
        trackback_delta=200,
        trackback_step=20,
        init_search_location=1000)
    print('found_width: ', found_width)

    flat.print_log_msg('Done')
def _scan_forward(wrapper, srch_val, start, delta, step):
    # Probe start+step, start+2*step, ... (below start+delta and inside wrapper);
    # on the first hit restart from there, and stop once a full window has no hit.
    found_more = True  # just to enter loop
    while found_more:
        found_more = False
        for i in range(start + step, start + delta, step):
            if i >= len(wrapper):
                break
            if wrapper[i] == srch_val:
                found_more = True
                start = i
                break
    return start


def trackback(wrapper, srch_val, start_search, delta_coarse, step_coarse, step_fine=1):
    """
    Push start_search forward to the last nearby index whose value equals srch_val.

    A coarse scan looks up to delta_coarse ahead in steps of step_coarse and
    restarts from every hit. Unless disabled, a fine scan then looks up to
    step_coarse ahead in steps of step_fine, again restarting from every hit.

    Parameters:
        wrapper: indexable object supporting len() and wrapper[i].
        srch_val: value that wrapper[i] is compared against.
        start_search: index the scan starts from (itself not re-checked).
        delta_coarse: width of the coarse look-ahead window.
        step_coarse: stride of the coarse scan; also the fine window width.
        step_fine: stride of the fine scan (default 1); 0 disables the fine scan.

    Returns:
        The last index at which a match was found, or start_search unchanged
        when nothing matched.

    Raises:
        ValueError: if step_fine is greater than step_coarse. This is checked
            before any scanning so that a bad call fails without first paying
            for the coarse search.
    """
    if step_fine > 0 and step_fine > step_coarse:
        raise ValueError('Error: step_fine is greater than step_coarse')

    flat.print_log_msg('Starting coarse search')
    start_search = _scan_forward(wrapper, srch_val, start_search, delta_coarse, step_coarse)

    # By default, step_fine = 1, therefore fine-grained search will happen.
    # It can be disabled by setting step_fine to 0.
    if step_fine > 0:
        flat.print_log_msg('Starting fine search')
        start_search = _scan_forward(wrapper, srch_val, start_search, step_coarse, step_fine)

    return start_search
def main():
    """
    Print the metric for a fixed chr21 breakpoint list and for an evenly
    spaced list covering the same range.

    For each list the raw metric dict is printed, followed by sum / N_zero.
    """
    # NOTE(review): older signature note kept from the original; its argument
    # order does not match the Metric(...) call below.
    # def __init__(self, name, snp_first, snp_last, input_config, breakpoints):
    # NOTE(review): `begin` is read below, so this assignment is treated as
    # live code -- confirm against the unmangled file.
    begin = 9411243
    end = 48119216
    # begin = 46287140
    # end = 48119216

    breakpoints1 = [
        10148322, 15250019, 15864313, 16491839, 17748811, 18252127, 18912106,
        19637870, 20332293, 20929869, 21190923, 21649595, 22318833, 23231365,
        24271200, 24774771, 25035980, 26088085, 27431612, 27666047, 28290149,
        28485200, 28761470, 29335757, 29790442, 30972911, 32778127, 33370496,
        34413058, 35253882, 35614394, 36328018, 37283402, 38078491, 39227880,
        39908770, 40259482, 40965403, 41448115, 41676786, 42689700, 43100808,
        43345207, 43799567, 44748107, 45265729, 45789905, 46336509, 46883153,
        47465743,
    ]

    def report(breakpoints):
        # Build the metric for one breakpoint list and print it plus its ratio.
        # metric = Metric('chr21', cnst.const['orig_data'], breakpoints1, begin, end)
        metric = Metric(
            'chr21',
            cnst.return_conf('/nethome/jkpickrell/1kG_data/covariance_matrix/'),
            breakpoints, begin, end)
        out = metric.calc_metric()
        print(out)
        print(out['sum']/out['N_zero'])

    report(breakpoints1)

    # Same number of intervals as breakpoints1, but evenly spaced from begin.
    spacing = int((end-begin)/(len(breakpoints1)-1))
    breakpoints2 = list(range(begin, end+1, spacing))
    report(breakpoints2)

    flat.print_log_msg('Done')
def run_local_search_single(chr_name, breakpoint_loci, locus_index, start, stop, total_sum, total_N, input_config, metric_out):
    """
    Run one local search for breakpoint_loci[locus_index] between start and stop.

    Returns:
        (new_breakpoint, new_metric) as produced by LocalSearch.search(). If
        anything in the attempt raises, the error is logged and the original
        breakpoint is returned with None as its metric, so the caller can
        carry on with the remaining breakpoints.
    """
    print("---- Running local search single")

    try:
        print("----", start, stop, locus_index, breakpoint_loci, total_sum, total_N)
        searcher = local_search.LocalSearch(
            chr_name, start, stop, locus_index, breakpoint_loci,
            total_sum, total_N, input_config)
        new_breakpoint, new_metric = searcher.search()
        print_breakpoint_comparison(
            new_breakpoint, new_metric, breakpoint_loci[locus_index], metric_out)
    except Exception as e:
        # Deliberately best-effort: report and fall back to the old breakpoint.
        log = flat.print_log_msg
        log('Error!')
        log(str(e))
        log('start: ' + repr(start))
        log('stop: ' + repr(stop))
        log('Continuing...')
        return breakpoint_loci[locus_index], None
    else:
        return new_breakpoint, new_metric
def custom_binary_search_with_trackback(np_init_array, f, srch_val, trackback_delta=200, trackback_step=20, init_search_location=1000):
    """
    Search for a filter width at which f applied to np_init_array gives srch_val.

    Steps: find_end() supplies an upper bound end_v, binsrch.find_le_ind()
    searches a FlexibleBoundedAccessor built over [0, end_v], and trackback()
    then moves the resulting index further along. Intermediate values are
    printed along the way.

    Returns:
        end_v minus the index returned by trackback().
    """
    flat.print_log_msg('Starting custom_binary_search_with_trackback')

    # One-sided binary (i.e., exponential) search first:
    #   apply f to np_init_array and check if init_search_location is smaller than srch_val;
    #   if not, double init search val and try again
    print('search_val: ', srch_val)
    upper_bound = find_end(np_init_array, f, init_search_location, srch_val)
    print('end_v: ', upper_bound)

    accessor = FlexibleBoundedAccessor(np_init_array, f, 0, upper_bound, True)

    # Search with deferred detection of equality
    raw_index = binsrch.find_le_ind(accessor, srch_val)
    print('found_width_raw: ', raw_index)
    print('found_width: ', upper_bound - raw_index)

    # Find any remaining "noisy" minima
    raw_index_trackback = trackback(accessor, srch_val, raw_index, trackback_delta, trackback_step)
    print("found_width_trackback_raw", raw_index_trackback)

    # Final result
    final_width = upper_bound - raw_index_trackback
    print('found_width final: ', final_width)

    return final_width
def pipeline_lean(dataset_path, name, out_fname, begin=-1, end=-1, img='no', orient='diag', red='sum', dataset_name='NONAME'):
    '''
    pipeline_lean(dataset_path, name, out_fname, begin=-1, end=-1, img='no', orient='diag', red='sum')

    Build a MatrixAnalysis for `name` and run the requested calculation.

    Parameters:
        dataset_path: passed to cnst.return_conf() to obtain the input config.
        name: passed to MatrixAnalysis as its name.
        out_fname: output file name handed to calc_diag_lean; also the base
            name of the image when img='yes'.
        begin, end: passed through to MatrixAnalysis.
        img: 'yes' or 'no' -- whether to call generate_img afterwards.
        orient: 'vert' (calc_vert) or 'diag' (calc_diag_lean).
        red: only 'sum' is accepted; 'avg' is rejected explicitly.
        dataset_name: unused; kept for interface compatibility.

    Raises:
        ValueError: for an unknown img/orient/red value, or for red='avg'.
            All options are checked before any data is touched, so a typo no
            longer costs a full calculation run.
    '''
    if img not in ('yes', 'no'):
        raise ValueError('Error: Unknown argument: ' + img)
    if orient not in ('vert', 'diag'):
        raise ValueError('Error: Unknown argument: ' + orient)
    if red == 'avg':
        raise ValueError(
            'Average used, but its output is not always consistent - especially for diag!'
        )
    if red != 'sum':
        raise ValueError('Error: Unknown argument: ' + red)

    generate_img = img == 'yes'

    analysis = matrix_to_vector.MatrixAnalysis(name, cnst.return_conf(dataset_path), begin, end)

    print(analysis.snp_first)
    print(analysis.snp_last)

    flat.print_log_msg('out_fname: ' + out_fname)

    # The last argument is dynamic_delete: data is only kept around when an
    # image has to be generated from it afterwards.
    # NOTE(review): calc_diag_lean as seen elsewhere in this code rejects
    # dynamic_delete=False, so img='yes' with orient='diag' presumably always
    # fails there -- confirm.
    if orient == 'vert':
        analysis.calc_vert(not generate_img)
    else:
        analysis.calc_diag_lean(out_fname, cnst.const['out_delim'], not generate_img)

    # Output is done step-by-step (inside calc_diag_lean), so there is no
    # separate write_output_to_file call here.

    if generate_img:
        analysis.generate_img(out_fname + cnst.const['img_out_ext'])

    flat.print_log_msg('Done')
def standard_run(np_init_array, np_init_array_x, start, stop, step):
    """
    Apply a series of filters, log the minima found for each, then plot.

    For every graph returned by filt.apply_filters(np_init_array, start, stop,
    step), the filtered minima indices and the loci from filt.get_minima_loc
    are logged together with the graph's width. All graphs are finally handed
    to plot_all.
    """
    # Interactive plots
    graphs = filt.apply_filters(np_init_array, start, stop, step)

    for graph in graphs:
        index_line = ''.join(('indices', repr(graph['width']), repr(graph['filtered_minima_ind'])))
        flat.print_log_msg(index_line)

        loci = filt.get_minima_loc(graph, np_init_array_x)
        loci_line = ''.join(('loci', repr(graph['width']), repr(loci)))
        flat.print_log_msg(loci_line)

    plot_all(np_init_array, graphs, np_init_array_x)
def print_breakpoint_comparison(breakpoint1, metric1, breakpoint2, metric2):
    """Log two breakpoints one after the other, each followed by its metric via print_metric."""
    entries = (
        ('Breakpoint 1: ', 'Metric 1:', breakpoint1, metric1),
        ('Breakpoint 2: ', 'Metric 2:', breakpoint2, metric2),
    )
    for breakpoint_label, metric_label, breakpoint_value, metric_value in entries:
        flat.print_log_msg(breakpoint_label + repr(breakpoint_value))
        flat.print_log_msg(metric_label)
        print_metric(metric_value)
def main():
    '''
    Apply a series of filter widths to the data and plot the results.

    Run with at least:
    a) 1 argument, defining the central filter width of analysis area
    b) 3 arguments, defining the start, stop, and step of series of filter widths
    optional last argument is file name of data to read into memory (if it hasn't already been read)

    Running with no arguments, or with 'help' as the first argument, prints
    this text and returns.
    '''
    # A missing first argument used to surface as a bare IndexError; show the
    # usage text instead.
    if len(sys.argv) < 2 or sys.argv[1] == 'help':
        print(main.__doc__)
        return

    # The data lives in module-level globals so that a later call in the same
    # process can reuse it instead of reading the file again.
    global np_init_array, np_init_array_x
    try:
        np_init_array
        np_init_array_x
    except NameError:
        flat.print_log_msg('Reading data')
        np_init_array, np_init_array_x = rd.read_data(sys.argv[-1])

    relative_width = 0.5

    if len(sys.argv) > 3:
        # Form b): explicit start, stop and step.
        start, stop, step = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3])
    else:
        # Form a): a window of +/- relative_width around the central width,
        # with the step set to one sixth of the window.
        center_val = int(sys.argv[1])
        start = math.floor(center_val - relative_width * center_val)
        stop = math.ceil(center_val + relative_width * center_val)
        step = math.floor(2 * relative_width * center_val / 6)

    standard_run(np_init_array, np_init_array_x, start, stop, step)

    flat.print_log_msg('Done')
def write_output_to_file(self, filename, out_delim, avg=False):
    """
    Write the accumulated vector to `filename` through flat.write_output.

    Parameters:
        filename: destination passed to flat.write_output.
        out_delim: delimiter passed to flat.write_output.
        avg: when true, self.vert_sum_len is passed as an extra trailing argument.

    Raises:
        Exception: if self.calculation_complete is not set.
    """
    if not self.calculation_complete:
        raise Exception(
            'Error: Calculation has not been completed prior to output file generation'
        )

    flat.print_log_msg('Writing output to file')

    write_args = [filename, self.locus_list, self.locus_list_deleted, self.vert_sum, out_delim]
    if avg:
        write_args.append(self.vert_sum_len)
    flat.write_output(*write_args)
def run_local_search_complete(chr_name, breakpoint_loci, begin, end, input_config, metric_out):
    """
    Run a local search around every breakpoint and collect the results.

    Each breakpoint is searched within a window bounded by the midpoints to
    its neighbours; the first window starts at `begin` and the last one stops
    at `end`. A single breakpoint is therefore searched over [begin, end], and
    an empty list gives an empty result (both previously raised IndexError).

    Parameters:
        chr_name, input_config: passed through to run_local_search_single.
        breakpoint_loci: sequence of current breakpoint loci.
        begin, end: outer limits of the searched range.
        metric_out: dict with 'sum' and 'N_zero' entries for the current
            breakpoints; also passed through for the comparison printout.

    Returns:
        dict with 'loci' (new breakpoints) and 'metrics' (their metrics), both
        in the order of breakpoint_loci.
    """
    breakpoint_loci_local_search = {'loci': [], 'metrics': []}

    total_sum = metric_out['sum']
    total_N = metric_out['N_zero']

    last_index = len(breakpoint_loci) - 1
    for locus_index in range(len(breakpoint_loci)):
        if locus_index == 0:
            b_start = begin
        else:
            b_start = int(midpoint(breakpoint_loci[locus_index-1], breakpoint_loci[locus_index]))

        if locus_index == last_index:
            b_stop = end
        else:
            # No "-1" here: not overlapping with the next region is taken care
            # of in local search.
            b_stop = int(midpoint(breakpoint_loci[locus_index], breakpoint_loci[locus_index+1]))

        new_breakpoint, new_metric = run_local_search_single(
            chr_name, breakpoint_loci, locus_index, b_start, b_stop,
            total_sum, total_N, input_config, metric_out)

        breakpoint_loci_local_search['loci'].append(new_breakpoint)
        breakpoint_loci_local_search['metrics'].append(new_metric)

    flat.print_log_msg('New breakpoints:')
    print(breakpoint_loci_local_search)

    return breakpoint_loci_local_search
def main():
    """
    Read the chr1 hotspot map and save two plots of it.

    'genetic_maps_output.png' shows all values; 'genetic_maps_zoomed_output.png'
    shows the slice 5000:10000.
    """
    name = 'chr1'

    flat.print_log_msg('Starting run')

    maps_conf = cnst.const['genetic_maps']
    x, y, pairs = flat.read_hotspots(
        maps_conf['root'] + maps_conf['file_base'] + name + maps_conf['ext'])

    def draw(xs, ys, title, out_png):
        # One 40x30 inch figure with the shared axis labels.
        pt.plot(xs, ys)
        fig = pt.gcf()
        fig.set_size_inches((40,30))
        pt.xlabel('SNP #')
        pt.ylabel('Hotspot val')
        pt.title(title)
        pt.savefig(out_png)

    flat.print_log_msg('Plotting')
    draw(x, y, 'Hotspots', 'genetic_maps_output.png')

    # Clear the current figure before drawing the zoomed plot.
    pt.clf()
    draw(x[5000:10000], y[5000:10000], 'Hotspots zoomed', 'genetic_maps_zoomed_output.png')

    flat.print_log_msg('Done')
def calc_diag_lean(self, out_fname, out_delim, dynamic_delete=True):
    """
    Accumulate correlation coefficients per locus, one partition at a time ("lean" variant).

    For every locus between the first locus >= self.snp_first and the end
    locus, the method walks outwards symmetrically over self.locus_list
    (x to the left, y to the right). For each pair present in self.matrix it
    passes matrix[x][y] / sqrt(matrix[x][x] * matrix[y][y]) to
    self.add_corr_coeff for the current locus. The walk stops when x or y
    leaves the current partition's bounds or locus_list runs out.

    After each partition, flat.delete_loci_smaller_than_lean is called with
    out_fname, self.vert_sum and out_delim (presumably writing finished loci
    to out_fname -- confirm in flat).

    Parameters:
        out_fname: passed to flat.delete_loci_smaller_than_lean.
        out_delim: passed to flat.delete_loci_smaller_than_lean.
        dynamic_delete: must be truthy; stored on self.dynamic_delete.

    Raises:
        Exception: if dynamic_delete is False, or if locus_list is empty
            after a partition has been read.

    Side effects visible here: sets self.start_locus, self.start_locus_index,
    self.end_locus, self.end_locus_index and self.calculation_complete, and
    emits a large amount of debug output via print().
    """
    if dynamic_delete == False:
        raise Exception(
            'Error: Conversion has been run in lean mode, but with dynamically=False.'
        )

    self.dynamic_delete = dynamic_delete

    flat.print_log_msg('Start')

    # pre-read all relevant partitions at beginning!
    # (every partition whose successor still starts at or before snp_first)
    last_p_num = -1
    for p_num_init in range(0, len(self.partitions) - 1):
        if self.snp_first >= self.partitions[p_num_init + 1][0]:
            flat.print_log_msg('Pre-reading partition: ' +
                               str(self.partitions[p_num_init]))
            flat.read_partition_into_matrix_lean(
                self.partitions, p_num_init, self.matrix, self.locus_list,
                self.name, self.input_config, self.snp_first, self.snp_last)
            last_p_num = p_num_init
        else:
            break

    # -1 marks "no current locus determined yet"
    curr_locus = -1
    for p_num in range(last_p_num + 1, len(self.partitions)):
        p = self.partitions[p_num]

        flat.print_log_msg('Reading partition: ' + str(p))
        flat.read_partition_into_matrix_lean(self.partitions, p_num,
                                             self.matrix, self.locus_list,
                                             self.name, self.input_config,
                                             self.snp_first, self.snp_last)

        # Determine first locus
        if curr_locus < 0:  # Either first partition or not found in first partition
            # curr_locus should have been set to -1 before entering the main for loop
            if len(self.locus_list) > 0:
                # Find first locus >= snp_first
                for i, locus in enumerate(self.locus_list):
                    if locus >= self.snp_first:
                        curr_locus = locus
                        start_locus = locus
                        curr_locus_index = i
                        start_locus_index = i
                        break
            else:
                raise Exception('Error: locus_list seems to be empty')
        else:
            try:
                # curr_locus is carried from prev iteration, but index has changed
                # since part of matrix (and locus_list) has been deleted
                curr_locus_index = self.locus_list.index(curr_locus)
            except ValueError:
                # The carried-over locus is gone: restart from the first remaining one.
                if len(self.locus_list) > 0:
                    curr_locus = self.locus_list[0]
                    curr_locus_index = 0
                else:
                    raise Exception('Error: locus_list seems to be empty')

        if curr_locus < 0:
            flat.print_log_msg(
                'Warning: curr_locus not found! Continuing to next partition.'
            )
            flat.print_log_msg(
                'Comment: This is possibly due to snp_first being very close to end of partition.'
            )
            flat.print_log_msg('Details: ')
            flat.print_log_msg('Partition: ' + repr(p))
            flat.print_log_msg('snp_first: ' + repr(self.snp_first))
            flat.print_log_msg('curr_locus: ' + repr(curr_locus))
            continue  # continue to next partition

        # Determine end locus
        if p_num + 1 < len(self.partitions):
            # Midpoint between the end of this partition and the start of the next.
            end_locus = int(
                (self.partitions[p_num][1] + self.partitions[p_num + 1][0]) /
                2)  # diag - specific
            print("1 end locus", end_locus, p_num)
        else:
            # Last partition: find last locus <= snp_last
            end_locus_found = False
            for i in reversed(range(0, len(self.locus_list))):
                if self.locus_list[i] <= self.snp_last:
                    end_locus = self.locus_list[i]
                    end_locus_index = i
                    end_locus_found = True
                    break

            # NOTE(review): this print reads end_locus before the fallback below
            # assigns it; if nothing was found and no earlier partition set
            # end_locus, it looks like this would fail -- confirm.
            print("2 end locus", end_locus, p_num)
            if not end_locus_found:
                end_locus_index = 0
                end_locus = self.locus_list[end_locus_index]

        flat.print_log_msg('Running for partition: ' + str(p))
        # This will not include the very last SNP of the complete range, but that
        # shouldn't be too important since the end of the range shouldn't be a
        # defining location for LD
        while curr_locus <= end_locus:
            print("-----" * 5)
            print("curr_locus", curr_locus)
            # Debug counters, only printed below.
            total_iterations = 0
            total_additions = 0
            x = self.locus_list[curr_locus_index]
            y = self.locus_list[curr_locus_index]
            print("x_idx", curr_locus_index)
            print("y_idx", curr_locus_index)
            print("x", x)
            print("y", y)
            # delta is the current distance (in list positions) from curr_locus_index
            delta = 0

            while x >= self.partitions[p_num][0] and y <= self.partitions[
                    p_num][1]:
                print(" x", x)
                print(" y", y)
                # Open question from the author: when would x not be in matrix
                # or y not be in matrix[x]?
                if x in self.matrix and y in self.matrix[x]:
                    corr_coeff = self.matrix[x][y] / math.sqrt(
                        self.matrix[x][x] * self.matrix[y][y])
                    self.add_corr_coeff(corr_coeff, curr_locus)
                    print(" self.vert_sum[curr_locus]", self.vert_sum[curr_locus])
                    total_additions += 1
                    # (storing corr_coeff back into the matrix was removed for chrom11)

                if delta != 0:
                    # Second pair for this delta: x one position closer to the
                    # centre, same y.
                    x = self.locus_list[curr_locus_index - delta + 1]
                    if x in self.matrix and y in self.matrix[x]:
                        corr_coeff = self.matrix[x][y] / math.sqrt(
                            self.matrix[x][x] * self.matrix[y][y])
                        self.add_corr_coeff(corr_coeff, curr_locus)
                        print(" self.vert_sum[curr_locus]", self.vert_sum[curr_locus])
                        total_additions += 1
                        # (storing corr_coeff back into the matrix was removed for chrom11)

                delta += 1

                # Step x one further to the left and y one further to the right,
                # stopping at either end of locus_list.
                if curr_locus_index - delta >= 0:
                    print("x_idx", curr_locus_index - delta)
                    x = self.locus_list[curr_locus_index - delta]
                else:
                    flat.print_log_msg('X index out of bounds')
                    break

                if curr_locus_index + delta < len(self.locus_list):
                    print("y_idx", curr_locus_index + delta)
                    y = self.locus_list[curr_locus_index + delta]
                else:
                    flat.print_log_msg('Y index out of bounds')
                    break
                total_iterations += 1

            print("total_iterations", total_iterations)
            print("total_additions", total_additions)
            # Advance to the next locus, or stop when locus_list is exhausted.
            if curr_locus_index + 1 < len(self.locus_list):
                curr_locus_index += 1
                curr_locus = self.locus_list[curr_locus_index]
            else:
                flat.print_log_msg('curr_locus_index out of bounds')
                break

        if self.dynamic_delete:
            flat.print_log_msg('Deleting loci not required any more')
            # Everything before the start of the next partition is no longer
            # needed; on the last partition delete up to end_locus instead.
            if p_num + 1 < len(self.partitions):
                delete_loc = self.partitions[p_num + 1][0]
            else:
                delete_loc = end_locus

            flat.delete_loci_smaller_than_lean(delete_loc, self.matrix,
                                               self.locus_list,
                                               self.locus_list_deleted,
                                               out_fname, self.vert_sum,
                                               out_delim)
        else:
            flat.print_log_msg('locus_list size: ' +
                               repr(len(self.locus_list)))

    # NOTE(review): end_locus_index is only assigned in the last-partition
    # branch above, and start_locus only once a first locus was found.
    self.start_locus = start_locus
    self.start_locus_index = start_locus_index
    self.end_locus = end_locus
    self.end_locus_index = end_locus_index

    self.calculation_complete = True
def pipeline(dataset_path, name, out_fname, begin=-1, end=-1, img='no', orient='diag', red='sum', snp=None, comment='', dataset_name='NONAME'):
    '''
    pipeline(dataset_path, name, out_fname, begin=-1, end=-1, img='no', orient='diag', red='sum', snp=None, comment='')

    Build a MatrixAnalysis for `name`, run the requested calculation, write
    the result to out_fname and optionally generate an image.

    Parameters:
        dataset_path: passed to cnst.return_conf() to obtain the input config.
        name: passed to MatrixAnalysis as its name.
        out_fname: output file name; the image is written to
            'img-' + out_fname + cnst.const['img_out_ext'].
        begin, end: passed through to MatrixAnalysis.
        img: 'yes' or 'no' -- whether to call generate_img afterwards.
        orient: 'vert' (calc_vert) or 'diag' (calc_diag).
        red: only 'sum' is accepted; 'avg' is rejected explicitly.
        snp: optional value forwarded to generate_img. The original note
            describes it as loci of two SNPs that need to be converted into
            ordinal numbers representing row/col in image of matrix.
        comment, dataset_name: unused; kept for interface compatibility.

    Raises:
        ValueError: for an unknown img/orient/red value, or for red='avg'.
            All options are checked before any data is touched, so a typo no
            longer costs a full calculation run.
    '''
    if img not in ('yes', 'no'):
        raise ValueError('Error: Unknown argument: ' + img)
    if orient not in ('vert', 'diag'):
        raise ValueError('Error: Unknown argument: ' + orient)
    if red == 'avg':
        raise ValueError(
            'Average used, but its output is not always consistent - especially for diag!'
        )
    if red != 'sum':
        raise ValueError('Error: Unknown argument: ' + red)

    generate_img = img == 'yes'
    avg = False  # 'avg' is rejected above, so output is always the plain sum

    analysis = matrix_to_vector.MatrixAnalysis(name, cnst.return_conf(dataset_path), begin, end)

    print(analysis.snp_first)
    print(analysis.snp_last)

    # The argument is dynamic_delete: data is only kept around when an image
    # has to be generated from it afterwards.
    if orient == 'vert':
        analysis.calc_vert(not generate_img)
    else:
        analysis.calc_diag(not generate_img)

    analysis.write_output_to_file(out_fname, cnst.const['out_delim'], avg)

    if generate_img:
        img_fname = 'img-' + out_fname + cnst.const['img_out_ext']
        if snp is not None:
            analysis.generate_img(img_fname, snp)
        else:
            analysis.generate_img(img_fname)

    flat.print_log_msg('Done')
def calc_metric_full(self):
    """
    Compute the breakpoint metric over all partitions and return self.metric.

    Loci are visited in order, partition by partition. For every locus, each
    entry of self.matrix[locus]['data'] whose key lies above the current
    breakpoint contributes the squared correlation coefficient
    (shrink[locus][key] / sqrt(shrink[locus][locus] * shrink[key][key])) ** 2
    to self.metric['sum'] and increments self.metric['N_nonzero'].
    self.metric['N_zero'] is built up in a deferred way: each time a
    breakpoint is passed, -(loci seen so far) * block_width is added, and at
    the very end (loci seen in total) * (sum of block widths) is added.

    Returns:
        self.metric, with 'sum', 'N_nonzero' and 'N_zero' updated.

    Raises:
        Exception: if self.dynamic_delete is not set, or if locus_list is
            empty after a partition has been read.

    Side effects visible here: loci before the next partition are removed via
    flat.delete_loci_smaller_than after each partition; self.start_locus,
    self.start_locus_index, self.end_locus, self.end_locus_index and
    self.calculation_complete are set.
    """
    if not self.dynamic_delete:
        raise Exception('Error: dynamic delete must be True for metric calculation!')

    flat.print_log_msg('Start metric')

    curr_breakpoint_index = 0
    block_height = 0
    # Number of loci seen since the last breakpoint was passed.
    block_width = 0

    # Decimal is used for the running totals that feed self.metric.
    total_N_SNPs = decimal.Decimal('0')
    block_width_sum = decimal.Decimal('0')

    # pre-read all relevant partitions at beginning!
    # (every partition whose successor still starts at or before snp_first)
    last_p_num = -1
    for p_num_init in range(0, len(self.partitions)-1):
        if self.snp_first >= self.partitions[p_num_init+1][0]:
            flat.print_log_msg('Pre-reading partition: '+str(self.partitions[p_num_init]))
            flat.read_partition_into_matrix(self.partitions, p_num_init, self.matrix, self.locus_list, self.name, self.input_config, self.snp_first, self.snp_last)
            last_p_num = p_num_init
        else:
            break

    # -1 marks "no current locus determined yet"
    curr_locus = -1
    for p_num in range(last_p_num+1, len(self.partitions)):
        p = self.partitions[p_num]

        flat.print_log_msg('Reading partition: '+str(p))
        flat.read_partition_into_matrix(self.partitions, p_num, self.matrix, self.locus_list, self.name, self.input_config, self.snp_first, self.snp_last)

        # Determine first locus
        if curr_locus<0: # Either first partition or not found in first partition
            # curr_locus should have been set to -1 before entering the main for loop
            if len(self.locus_list)>0:
                # Find first locus >= snp_first
                for i, locus in enumerate(self.locus_list):
                    if locus >= self.snp_first:
                        curr_locus = locus
                        start_locus = locus
                        curr_locus_index = i
                        start_locus_index = i
                        break
            else:
                raise Exception('Error: locus_list seems to be empty')
        else:
            try:
                # curr_locus is carried from prev iteration, but index has changed
                # since part of matrix (and locus_list) has been deleted
                curr_locus_index = self.locus_list.index(curr_locus)
            except ValueError:
                # The carried-over locus is gone: restart from the first remaining one.
                if len(self.locus_list)>0:
                    curr_locus = self.locus_list[0]
                    curr_locus_index = 0
                else:
                    raise Exception('Error: locus_list seems to be empty')

        if curr_locus<0:
            flat.print_log_msg('Warning: curr_locus not found! Continuing to next partition.')
            flat.print_log_msg('Comment: This is possibly due to snp_first being very close to end of partition.')
            flat.print_log_msg('Details: ')
            flat.print_log_msg('Partition: '+repr(p))
            flat.print_log_msg('snp_first: '+repr(self.snp_first))
            flat.print_log_msg('curr_locus: '+repr(curr_locus))
            continue #continue to next partition

        # Determine last locus
        if p_num+1 < len(self.partitions):
            # Not the last partition: run up to the start of the next one.
            end_locus = self.partitions[p_num+1][0]
            end_locus_index = -1
        else:
            # Last partition: find last locus <= snp_last
            end_locus_found = False
            for i in reversed(range(0, len(self.locus_list))):
                if self.locus_list[i] <= self.snp_last:
                    end_locus = self.locus_list[i]
                    end_locus_index = i
                    end_locus_found = True
                    break

            if not end_locus_found:
                end_locus_index = 0
                end_locus = self.locus_list[end_locus_index]

        flat.print_log_msg('Running metric for partition: '+str(p))
        # This will not include the very last SNP of the complete range, but that
        # shouldn't be too important since the end of the range shouldn't be a
        # defining location for LD
        while curr_locus <= end_locus:
            if curr_breakpoint_index<len(self.breakpoints):
                if curr_locus > self.breakpoints[curr_breakpoint_index]: # Breakpoint is the last element of the block!
                    # Negative height: this is in accordance with the formula for
                    # deferred sum calculation (completed after the main loop).
                    block_height = 0 - total_N_SNPs
                    self.metric['N_zero'] += block_height * block_width

                    block_width_sum += block_width

                    curr_breakpoint_index += 1
                    block_width = 0

            # All breakpoints have been passed: nothing further to accumulate
            # for this partition.
            if curr_breakpoint_index>=len(self.breakpoints):
                break

            try:
                for key, el in self.matrix[curr_locus]['data'].items():
                    if key > self.breakpoints[curr_breakpoint_index]: # Only add those above the breakpoint!
                        corr_coeff = self.matrix[curr_locus]['data'][key]['shrink'] / math.sqrt( self.matrix[curr_locus]['data'][curr_locus]['shrink'] * self.matrix[key]['data'][key]['shrink'] )
                        self.metric['sum'] += decimal.Decimal(corr_coeff**2)
                        self.metric['N_nonzero'] += 1
            except IndexError as e:
                # NOTE(review): key/el are only bound once the loop above has
                # started, so this diagnostic dump may itself fail -- confirm.
                print('Error!')
                print(e)
                print(key, el)
                print(curr_locus)
                print(self.matrix)
                print(self.breakpoints)
                print(curr_breakpoint_index)

            block_width += 1 # block_width needs to be increased even if it doesn't have values in the outer part of the matrix!

            # Advance to the next locus, or stop when locus_list is exhausted.
            if curr_locus_index+1 < len(self.locus_list):
                curr_locus_index+=1
                curr_locus = self.locus_list[curr_locus_index]
                total_N_SNPs += 1
            else:
                flat.print_log_msg('curr_locus_index out of bounds')
                break

        if self.dynamic_delete:
            flat.print_log_msg('Deleting loci not required any more')
            flat.delete_loci_smaller_than(end_locus, self.matrix, self.locus_list, self.locus_list_deleted)

    # NOTE(review): start_locus is only assigned once a first locus was found.
    self.start_locus = start_locus
    self.start_locus_index = start_locus_index
    self.end_locus = end_locus
    self.end_locus_index = end_locus_index

    self.metric['N_zero'] += total_N_SNPs * block_width_sum # this is in accordance with the formula for deferred sum calculation

    print('total_N_SNPs, block_width', total_N_SNPs, block_width)
    print('total_N_SNPs-block_width', total_N_SNPs-block_width)
    print('block_width_sum', block_width_sum)

    self.calculation_complete = True

    return self.metric
def pipeline(input_fname, chr_name, dataset_path, n_snps_bw_bpoints, out_fname, begin=-1, end=-1, trackback_delta=200, trackback_step=20, init_search_location=1000):
    """
    Find breakpoints from the filtered input vector, refine them with local
    search, compare against uniform breakpoints and pickle all results.

    Stages:
      1. read the input vector and clip it to [begin, end];
      2. derive the target number of breakpoints from n_snps_bw_bpoints;
      3. search for the filter width that gives that many minima;
      4. take the minima of the filtered data as breakpoints and compute their metric;
      5. build the same number of uniformly spaced breakpoints and compute their metric;
      6. run local search on both sets and recompute both metrics;
      7. dump everything to out_fname with pickle.

    Earlier revisions of this function contained two leftover debugging
    `raise` statements (after stages 4 and 6) that aborted every run with a
    RuntimeError, and stage 5 was commented out although stages 6-7 read its
    results. Both are repaired here so the function runs to completion.

    Parameters:
        input_fname: file read by rd.read_data_raw.
        chr_name: chromosome name passed to the metric and local-search helpers.
        dataset_path: passed to cnst.return_conf() to obtain the input config.
        n_snps_bw_bpoints: desired number of SNPs between breakpoints.
        out_fname: path of the pickle file to write.
        begin, end: requested range; resolved through flat.first_last.
        trackback_delta, trackback_step, init_search_location: passed to
            find_minima.custom_binary_search_with_trackback.
    """
    config = cnst.return_conf(dataset_path)

    # (author's note) just reads first and last position in partitions
    begin, end = flat.first_last(chr_name, config, begin, end)

    # READ DATA
    flat.print_log_msg('* Reading data')
    # (author's note) just reads into snp pos and val into first and second list
    init_array, init_array_x = rd.read_data_raw(input_fname)

    # Clip the input data to the required range and convert to numpy array
    # (author's note) just a bisect left and bisect right
    begin_ind = binsrch.find_ge_ind(init_array_x, begin)  # = init_array_x.index(begin)
    end_ind = binsrch.find_le_ind(init_array_x, end)  # = init_array_x.index(end)

    np_init_array = np.array(init_array[begin_ind:(end_ind + 1)])
    np_init_array_x = np.array(init_array_x[begin_ind:(end_ind + 1)])

    # DETERMINE NUMBER OF BREAKPOINTS
    n_bpoints = int(math.ceil(len(np_init_array_x) / n_snps_bw_bpoints - 1))

    # SEARCH FOR FILTER WIDTH
    found_width = find_minima.custom_binary_search_with_trackback(
        np_init_array,
        filt.apply_filter_get_minima,
        n_bpoints,
        trackback_delta=trackback_delta,
        trackback_step=trackback_step,
        init_search_location=init_search_location)

    # GET MINIMA LOCATIONS
    flat.print_log_msg('* Applying filter and getting minima locations...')
    # (author's note) just applies hanning to init_array
    g = filt.apply_filter(np_init_array, found_width)
    breakpoint_loci = filt.get_minima_loc(g, np_init_array_x)

    # METRIC
    metric_out = apply_metric(chr_name, begin, end, config, breakpoint_loci)
    flat.print_log_msg('Global metric:')
    print_metric(metric_out)

    # METRIC FOR UNIFORM BREAKPOINTS
    flat.print_log_msg('* Calculating metric for uniform breakpoints...')
    # NOTE(review): the uniform breakpoints are taken from the unclipped
    # init_array_x rather than np_init_array_x -- confirm this is intended.
    step = int(len(init_array_x)/(len(breakpoint_loci)+1))
    breakpoint_loci_uniform = [init_array_x[i] for i in range(step, len(init_array_x)-step+1, step)]
    metric_out_uniform = apply_metric(chr_name, begin, end, config, breakpoint_loci_uniform)
    flat.print_log_msg('Global metric:')
    print_metric(metric_out_uniform)

    # LOCAL SEARCH ON FOURIER - missing N runs
    flat.print_log_msg('* Running local search for fourier...')
    breakpoint_loci_local_search = run_local_search_complete(
        chr_name, breakpoint_loci, begin, end, config, metric_out)

    # RUN METRIC AGAIN W/ NEW BREAKPOINTS FROM FOURIER LOCAL SEARCH
    flat.print_log_msg('* Calculating metric for new fourier breakpoints...')
    metric_out_local_search = apply_metric(
        chr_name, begin, end, config, breakpoint_loci_local_search['loci'])
    flat.print_log_msg('Global metric:')
    print_metric(metric_out_local_search)

    # LOCAL SEARCH ON UNIFORM - missing N runs
    flat.print_log_msg('* Running local search for uniform breakpoints...')
    breakpoint_loci_uniform_local_search = run_local_search_complete(
        chr_name, breakpoint_loci_uniform, begin, end, config,
        metric_out_uniform)

    # RUN METRIC AGAIN W/ NEW BREAKPOINTS FROM UNIFORM
    flat.print_log_msg('* Calculating metric for new uniform breakpoints...')
    metric_out_uniform_local_search = apply_metric(
        chr_name, begin, end, config,
        breakpoint_loci_uniform_local_search['loci'])
    flat.print_log_msg('Global metric:')
    print_metric(metric_out_uniform_local_search)

    # DUMP DATA INTO PICKLE SO IT CAN BE ANALYZED AND LOOKED AT WITHOUT RE-RUNNING EVERYTHING
    pickle_out = {}
    pickle_out['argv'] = sys.argv
    pickle_out['n_bpoints'] = n_bpoints
    pickle_out['found_width'] = found_width
    pickle_out['fourier'] = {}
    pickle_out['fourier']['loci'] = breakpoint_loci
    pickle_out['fourier']['metric'] = metric_out
    pickle_out['uniform'] = {}
    pickle_out['uniform']['loci'] = breakpoint_loci_uniform
    pickle_out['uniform']['metric'] = metric_out_uniform
    # Yes, breakpoint_loci_local_search is already a dict with 'loci' and
    # 'metrics' keys; the global metric is added next to them.
    pickle_out['fourier_ls'] = breakpoint_loci_local_search
    pickle_out['fourier_ls']['metric'] = metric_out_local_search
    pickle_out['uniform_ls'] = breakpoint_loci_uniform_local_search
    pickle_out['uniform_ls']['metric'] = metric_out_uniform_local_search

    with open(out_fname, 'wb') as f_out:
        pickle.dump(pickle_out, f_out)

    flat.print_log_msg('Done')
def calc_diag(self, dynamic_delete=True):
    """Compute correlation coefficients for locus pairs centred on each locus.

    Partitions are loaded one at a time with
    ``flat.read_partition_into_matrix``. For every locus from the first one
    that is >= ``self.snp_first`` up to the partition's ``end_locus``, the
    method walks outwards through ``self.locus_list`` (``x`` towards lower
    indices, ``y`` towards higher indices) for as long as both stay inside the
    current partition's bounds. For each pair present in ``self.matrix`` it
    computes ``shrink[x][y] / sqrt(shrink[x][x] * shrink[y][y])``, passes the
    value to ``self.add_corr_coeff`` together with the centre locus, and
    stores it in the matrix under the ``'corr_coeff'`` key.

    Args:
        dynamic_delete: Stored on ``self.dynamic_delete``. When true, loci
            smaller than the start of the next partition (or than
            ``end_locus`` for the last partition) are removed through
            ``flat.delete_loci_smaller_than`` after each partition.

    Raises:
        Exception: If ``self.locus_list`` is empty after a partition was read.

    Side effects:
        Sets ``self.start_locus``, ``self.start_locus_index``,
        ``self.end_locus``, ``self.end_locus_index`` and
        ``self.calculation_complete``.

    NOTE(review): ``start_locus``/``start_locus_index`` are bound only once a
    locus >= ``snp_first`` has been found, and ``end_locus_index`` only in the
    last-partition branch. If those paths are never taken, the assignments at
    the end of the method raise ``UnboundLocalError``.
    """
    # flat.print_log_msg('Removing existing matrix output file')
    # try:
    #     os.remove(cnst.const['out_matrix_delim'])
    # except OSError:
    #     pass
    self.dynamic_delete = dynamic_delete
    flat.print_log_msg('Start')
    # pre-read all relevant partitions at beginning!
    # A partition is only pre-read (not processed) while snp_first lies at or
    # beyond the start of the partition that follows it.
    last_p_num = -1
    for p_num_init in range(0, len(self.partitions) - 1):
        if self.snp_first >= self.partitions[p_num_init + 1][0]:
            flat.print_log_msg('Pre-reading partition: ' +
                               str(self.partitions[p_num_init]))
            flat.read_partition_into_matrix(self.partitions, p_num_init,
                                            self.matrix, self.locus_list,
                                            self.name, self.input_config,
                                            self.snp_first, self.snp_last)
            last_p_num = p_num_init
        else:
            break
    curr_locus = -1
    # for p_num, p in enumerate(self.partitions):
    for p_num in range(last_p_num + 1, len(self.partitions)):
        p = self.partitions[p_num]
        flat.print_log_msg('Reading partition: ' + str(p))
        flat.read_partition_into_matrix(self.partitions, p_num, self.matrix,
                                        self.locus_list, self.name,
                                        self.input_config, self.snp_first,
                                        self.snp_last)
        # Determine first locus
        if curr_locus < 0:  # Either first partition or not found in first partition
            # curr_locus = -1 # <- this should have been set to -1 before entering the main for loop
            if len(self.locus_list) > 0:
                # Find first locus >= snp_first
                for i, locus in enumerate(self.locus_list):
                    if locus >= self.snp_first:
                        curr_locus = locus
                        start_locus = locus
                        curr_locus_index = i
                        start_locus_index = i
                        break
            else:
                raise Exception('Error: locus_list seems to be empty')
        # else:
        #     if len(self.locus_list)>0:
        #         curr_locus = self.locus_list[0]
        #         curr_locus_index = 0
        #     else:
        #         raise Exception('Error: locus_list seems to be empty')
        else:
            try:
                curr_locus_index = self.locus_list.index(curr_locus)
                # curr_locus is carried from prev iteration, but index has changed since part of matrix (and locus_list) has been deleted
            except ValueError:
                # The carried-over locus is gone from the list: restart at
                # the first locus that is still present.
                if len(self.locus_list) > 0:
                    curr_locus = self.locus_list[0]
                    curr_locus_index = 0
                else:
                    raise Exception('Error: locus_list seems to be empty')
        if curr_locus < 0:
            flat.print_log_msg(
                'Warning: curr_locus not found! Continuing to next partition.'
            )
            flat.print_log_msg(
                'Comment: This is possibly due to snp_first being very close to end of partition.'
            )
            flat.print_log_msg('Details: ')
            flat.print_log_msg('Partition: ' + repr(p))
            flat.print_log_msg('snp_first: ' + repr(self.snp_first))
            flat.print_log_msg('curr_locus: ' + repr(curr_locus))
            continue  #continue to next partition
            # raise Exception('Error: curr_locus not found!')
        # Determine end locus
        if p_num + 1 < len(self.partitions):
            # Midpoint between the end of this partition and the start of
            # the next one.
            end_locus = int(
                (self.partitions[p_num][1] + self.partitions[p_num + 1][0]) /
                2)
        else:
            # end_locus = self.partitions[p_num][1]
            # Find last locus <= snp_last
            end_locus_found = False
            for i in reversed(range(0, len(self.locus_list))):
                # for locus in reversed(locus_list):
                if self.locus_list[i] <= self.snp_last:
                    end_locus = self.locus_list[i]
                    end_locus_index = i
                    end_locus_found = True
                    break
            if not end_locus_found:
                end_locus_index = 0
                end_locus = self.locus_list[end_locus_index]
        flat.print_log_msg('Running for partition: ' + str(p))
        # This will not include the very last SNP of the complete range, but that shouldn't be too important since the end of the range shouldn't be a defining location for LD
        while curr_locus <= end_locus:
            # Start with x == y == current locus and widen by one index on
            # each side per inner iteration.
            x = self.locus_list[curr_locus_index]
            y = self.locus_list[curr_locus_index]
            delta = 0
            while x >= self.partitions[p_num][0] and y <= self.partitions[
                    p_num][1]:
                # Pair (index - delta, index + delta).
                if x in self.matrix and y in self.matrix[x]['data']:
                    corr_coeff = self.matrix[x]['data'][y][
                        'shrink'] / math.sqrt(
                            self.matrix[x]['data'][x]['shrink'] *
                            self.matrix[y]['data'][y]['shrink'])
                    self.add_corr_coeff(corr_coeff, curr_locus)
                    # Just save it in the matrix ;) ...for img
                    self.matrix[x]['data'][y]['corr_coeff'] = corr_coeff
                if delta != 0:
                    # Pair (index - delta + 1, index + delta): x moves one
                    # step back towards the centre, y stays.
                    x = self.locus_list[curr_locus_index - delta + 1]
                    if x in self.matrix and y in self.matrix[x]['data']:
                        corr_coeff = self.matrix[x]['data'][y][
                            'shrink'] / math.sqrt(
                                self.matrix[x]['data'][x]['shrink'] *
                                self.matrix[y]['data'][y]['shrink'])
                        self.add_corr_coeff(corr_coeff, curr_locus)
                        # Just save it in the matrix ;) ...for img
                        self.matrix[x]['data'][y][
                            'corr_coeff'] = corr_coeff
                delta += 1
                if curr_locus_index - delta >= 0:
                    x = self.locus_list[curr_locus_index - delta]
                else:
                    # flat.print_log_msg('X index out of bounds')
                    break
                if curr_locus_index + delta < len(self.locus_list):
                    y = self.locus_list[curr_locus_index + delta]
                else:
                    # flat.print_log_msg('Y index out of bounds')
                    break
            if curr_locus_index + 1 < len(self.locus_list):
                curr_locus_index += 1
                curr_locus = self.locus_list[curr_locus_index]
            else:
                flat.print_log_msg('curr_locus_index out of bounds')
                break
        # flat.delete_loci_smaller_than_and_output_matrix_to_file(end_locus, self.matrix, locus_list, locus_list_deleted, cnst.const['out_matrix_filename'])
        if self.dynamic_delete:
            flat.print_log_msg('Deleting loci not required any more')
            if p_num + 1 < len(self.partitions):
                delete_loc = self.partitions[p_num + 1][0]  # diag - specific
            else:
                delete_loc = end_locus
            flat.delete_loci_smaller_than(delete_loc, self.matrix,
                                          self.locus_list,
                                          self.locus_list_deleted)
        else:
            flat.print_log_msg('locus_list size: ' +
                               repr(len(self.locus_list)))
    self.start_locus = start_locus
    self.start_locus_index = start_locus_index
    self.end_locus = end_locus
    self.end_locus_index = end_locus_index
    self.calculation_complete = True
def generate_img(self, img_full_path, marked_snp=None):
    """Plot the squared correlation coefficients as a matrix image.

    Every ``'corr_coeff'`` entry of ``self.matrix`` whose two loci lie in
    ``[self.snp_first, self.snp_last]`` is squared and written into a square
    grid indexed by the loci's positions in ``self.locus_list`` (offset by
    ``self.start_locus_index``). The grid is drawn with ``pcolormesh`` and
    saved to ``img_full_path``.

    Args:
        img_full_path: Destination passed to ``matplotlib.pyplot.savefig``.
        marked_snp: Optional locus to mark with a green ``x`` on the
            diagonal. It must be one of the plotted loci.

    Raises:
        Exception: If the calculation has not completed, if the matrix was
            dynamically deleted, or if the matrix is empty.
        ValueError: If a plotted locus is missing from ``self.locus_list`` or
            ``marked_snp`` is not among the plotted loci.
    """
    import numpy as np
    import matplotlib as mpl
    mpl.use('Agg')
    import matplotlib.pyplot as pt
    mpl.rcParams.update({'font.size': 22})

    if not self.calculation_complete:
        raise Exception(
            'Error: Calculation has not been completed prior to image generation'
        )
    if self.dynamic_delete:
        raise Exception(
            'Error: The matrix was dynamically deleted - cannot generate full image!'
        )
    if len(self.matrix) <= 0:
        raise Exception('Error: The matrix is emmpty or erroneous')

    flat.print_log_msg('Image init')
    plot_mtrx_size = self.end_locus_index - self.start_locus_index + 1
    plot_mtrx = [[0] * plot_mtrx_size for _ in range(plot_mtrx_size)]
    flat.print_log_msg('Plot matrix size: ' + str(plot_mtrx_size))
    flat.print_log_msg('Matrix size: ' + str(len(self.matrix)))
    flat.print_log_msg('locus_list size: ' + str(len(self.locus_list)))
    flat.print_log_msg('locus_list_deleted size: ' +
                       str(len(self.locus_list_deleted)))
    x_values = [0] * plot_mtrx_size

    flat.print_log_msg('Generating image data')

    # Build the locus -> plot position table once instead of calling
    # list.index() for every cell. setdefault keeps the first occurrence,
    # which is what list.index() returns.
    plot_index = {}
    for list_pos, locus in enumerate(self.locus_list):
        plot_index.setdefault(locus, list_pos - self.start_locus_index)

    def plot_pos(locus):
        # Keep raising ValueError for unknown loci, as list.index() does.
        try:
            return plot_index[locus]
        except KeyError:
            raise ValueError(repr(locus) + ' is not in list') from None

    for loc_i in self.matrix:
        if not (self.snp_first <= loc_i <= self.snp_last):
            continue
        row = plot_pos(loc_i)
        x_values[row] = loc_i
        for loc_j in self.matrix[loc_i]['data']:
            if not (self.snp_first <= loc_j <= self.snp_last):
                continue
            if 'corr_coeff' in self.matrix[loc_i]['data'][loc_j]:
                try:
                    plot_mtrx[row][plot_pos(loc_j)] = (
                        (self.matrix[loc_i]['data'][loc_j]['corr_coeff'])**2)
                except IndexError:
                    # Diagnostic dump of the offending coordinates.
                    print(row)
                    print(len(plot_mtrx))
                    print(plot_pos(loc_j))
                    print(len(plot_mtrx[row]))
            else:
                flat.print_log_msg("No 'corr_coef' key at: " + str(loc_i) +
                                   ' ' + str(loc_j))

    flat.print_log_msg('Writing image file...')
    fig = pt.gcf()
    try:
        pt.pcolormesh(np.array(plot_mtrx), cmap='binary', vmin=0, vmax=1)
        pt.colorbar()
        if marked_snp is not None:
            bpoint_loc = x_values.index(marked_snp)
            pt.scatter((bpoint_loc), (bpoint_loc), marker='x', color='green')
            flat.print_log_msg('SNP: ' + repr(marked_snp) + ' @ index: ' +
                               repr(bpoint_loc) + ' in graph')
        fig.set_size_inches((40, 30))
        pt.xlabel('SNP #')
        pt.ylabel('SNP #')
        pt.title('Correlation coefficient squared matrix')
        pt.savefig(img_full_path)
    finally:
        # Release the figure so repeated calls neither leak figures nor draw
        # on top of the previous image.
        pt.close(fig)
def calc_vert(self, dynamic_delete=True, sum_both_sides=True):
    """Deprecated: always raises before doing any work.

    The first statement raises unconditionally, so everything after it is
    unreachable and is kept only as a reference implementation. As written,
    that code would read partitions one by one, compute a correlation
    coefficient for every entry stored under each locus, add it to the
    current locus (and to the paired locus when ``sum_both_sides`` is true)
    via ``self.add_corr_coeff``, and optionally delete processed loci.

    Args:
        dynamic_delete: Unused while the method is disabled.
        sum_both_sides: Unused while the method is disabled.

    Raises:
        Exception: Always.
    """
    # flat.print_log_msg('Removing existing matrix output file')
    # try:
    #     os.remove(cnst.const['out_matrix_delim'])
    # except OSError:
    #     pass
    raise Exception('calc_vert is deprecated - check code before running!')
    # --- Unreachable from here on (see the raise above) ---
    self.dynamic_delete = dynamic_delete
    flat.print_log_msg('Start')
    for p_num, p in enumerate(self.partitions):
        flat.print_log_msg('Reading partition: ' + str(p))
        flat.read_partition_into_matrix(self.partitions, p_num, self.matrix,
                                        self.locus_list, self.name,
                                        self.input_config, self.snp_first,
                                        self.snp_last)
        # Determine first locus
        curr_locus = -1
        if p_num == 0:
            if len(self.locus_list) > 0:
                # Find first locus >= snp_first
                for i, locus in enumerate(self.locus_list):
                    if locus >= self.snp_first:
                        curr_locus = locus
                        start_locus = locus
                        curr_locus_index = i
                        start_locus_index = i
                        break
            else:
                raise Exception('Error: locus_list seems to be empty')
        else:
            if len(self.locus_list) > 0:
                curr_locus = self.locus_list[0]
                curr_locus_index = 0
            else:
                raise Exception('Error: locus_list seems to be empty')
        if curr_locus < 0:
            raise Exception('Error: curr_locus not found!')
        if p_num + 1 < len(self.partitions):
            end_locus = self.partitions[p_num + 1][0]
            end_locus_index = -1
        else:
            # end_locus = partitions[p_num][1]
            # Find last locus <= snp_last
            for i in reversed(range(0, len(self.locus_list))):
                # for locus in reversed(locus_list):
                if self.locus_list[i] <= self.snp_last:
                    end_locus = self.locus_list[i]
                    end_locus_index = i
                    break
        flat.print_log_msg('Running for partition: ' + str(p))
        # This will not include the very last SNP of the complete range, but that shouldn't be too important since the end of the range shouldn't be a defining location for LD
        while curr_locus < end_locus:
            for key, el in self.matrix[curr_locus]['data'].items():
                corr_coeff = self.matrix[curr_locus]['data'][key][
                    'shrink'] / math.sqrt(
                        self.matrix[curr_locus]['data'][curr_locus]
                        ['shrink'] * self.matrix[key]['data'][key]['shrink'])
                self.add_corr_coeff(corr_coeff, curr_locus)
                if sum_both_sides:
                    self.add_corr_coeff(corr_coeff, key)
                # Just save it in the matrix ;)
                self.matrix[curr_locus]['data'][key][
                    'corr_coeff'] = corr_coeff
            if curr_locus_index + 1 < len(self.locus_list):
                curr_locus_index += 1
                curr_locus = self.locus_list[curr_locus_index]
            else:
                flat.print_log_msg('curr_locus_index out of bounds')
                break
        # flat.delete_loci_smaller_than_and_output_matrix_to_file(end_locus, self.matrix, locus_list, locus_list_deleted, cnst.const['out_matrix_filename'])
        if self.dynamic_delete:
            flat.print_log_msg('Deleting loci not required any more')
            flat.delete_loci_smaller_than(end_locus, self.matrix,
                                          self.locus_list,
                                          self.locus_list_deleted)
    self.start_locus = start_locus
    self.start_locus_index = start_locus_index
    self.end_locus = end_locus
    self.end_locus_index = end_locus_index
    self.calculation_complete = True
def print_metric(metric_out):
    """Log the sum, the count and their ratio from a metric result.

    Args:
        metric_out: Mapping with a ``'sum'`` and an ``'N_zero'`` entry; the
            reported metric is ``sum / N_zero``.
    """
    log = flat.print_log_msg
    log('Sum: ' + repr(metric_out['sum']))
    log("N (w/ zero's): " + repr(metric_out['N_zero']))
    ratio = metric_out['sum'] / metric_out['N_zero']
    log('Metric: ' + repr(ratio))
def search(self):
    """Look for a better breakpoint position around the initial breakpoint.

    Starting from the locus at (or just below) the initial breakpoint, the
    method scans locus by locus to the right (up to ``self.snp_last``) and
    then to the left (down to, but excluding, ``self.snp_first``). At every
    step the running sum and running N are updated incrementally from the
    precomputed ``'sum_horiz'``/``'sum_vert'`` values and from index
    distances to the bottom/top borders, and ``sum / N`` is compared with the
    best metric seen so far (initially ``total_sum / total_N``).

    If ``self.init_complete`` is false, ``self.init_search()`` is run first.

    Returns:
        A tuple ``(min_breakpoint, min_metric_details)``:

        * ``min_breakpoint`` is the locus with the smallest metric, or
          ``None`` when no scanned locus improved on the initial metric.
        * ``min_metric_details`` is a dict with the ``'sum'`` and
          ``'N_zero'`` belonging to the best metric.

        If looking up the border indices raises, the error is logged and
        ``(self.breakpoints[self.initial_breakpoint_index], None)`` is
        returned instead.

    Side effects:
        Sets ``self.search_complete`` on the normal return path and writes
        diagnostic output via ``print`` and ``flat.print_log_msg``.
    """
    print("----- Running search")
    if not self.init_complete:
        flat.print_log_msg('init_search() must be run before search(). Starting automatically...')
        self.init_search()
    flat.print_log_msg('Starting local search...')
    print("addy", len(self.precomputed['data']))
    print("len(locus_list)", len(self.precomputed["locus_list"]))
    print("locus_list", self.precomputed["locus_list"][:5], self.precomputed["locus_list"][-5:])
    # In case the value itself is not in the list:
    try:
        print("hiihihih", self.snp_bottom, self.snp_top)
        snp_bottom_ind = binsrch.find_ge_ind(self.precomputed['locus_list'], self.snp_bottom)
        snp_top_ind = binsrch.find_le_ind(self.precomputed['locus_list'], self.snp_top)
    except Exception as e:
        # Best effort: dump the state and hand back the unchanged breakpoint.
        flat.print_log_msg('Error2!')
        flat.print_log_msg(repr(e))
        flat.print_log_msg('self.precomputed[\'locus_list\']: '+repr(self.precomputed['locus_list']))
        flat.print_log_msg('self.snp_bottom: '+repr(self.snp_bottom))
        flat.print_log_msg('self.snp_first: '+repr(self.snp_first))
        flat.print_log_msg('self.snp_last: '+repr(self.snp_last))
        flat.print_log_msg('self.snp_top: '+repr(self.snp_top))
        flat.print_log_msg('self.__dict__: '+repr(self.__dict__))
        flat.print_log_msg('Continuing...')
        return self.breakpoints[self.initial_breakpoint_index], None
    print("self.snp_bottom", self.snp_bottom) #, len(self.precomputed["locus_list"]))
    print("self.snp_top", self.snp_top)
    print("self.initial_breakpoint_index", self.initial_breakpoint_index)
    print("snp_bottom_ind", snp_bottom_ind)
    print("snp_top_ind", snp_top_ind)
    # Old:
    # snp_first_ind = self.precomputed['locus_list'].index(self.snp_first) # This should be snp_bottom
    # snp_top_ind = self.precomputed['locus_list'].index(self.snp_top)
    # Start from init breakpoint and search left. Then start from init_breakpoint again and search right.
    # We start from init_breakpoint because that's the initial sum and N that we have -> so we can use the precomputed data to incrementally check for
    # Find the closest locus to the breakpoint value, because a breakpoint doesn't necessarily have to be in the locus_list
    breakpoint_index_in_locus_list = binsrch.find_le_ind(self.precomputed['locus_list'], self.breakpoints[self.initial_breakpoint_index])
    # print("breakpoint_index_in_locus_list", breakpoint_index_in_locus_list)
    # print("breakpoint_index_in_locus_list", self.precomputed["locus_list"])
    # NOTE(review): the label says "index" but the value printed is the list length.
    print("breakpoint_index_in_locus_list", len(self.precomputed["locus_list"]))
    init_breakpoint_locus = self.precomputed['locus_list'][breakpoint_index_in_locus_list]
    # Old:
    # breakpoint_index_in_locus_list = self.precomputed['locus_list'].index(self.breakpoints[self.initial_breakpoint_index])
    curr_sum = self.total_sum
    curr_N = self.total_N
    print("curr_sum", curr_sum)
    print("curr_N", curr_N)
    min_metric = decimal.Decimal(self.total_sum) / decimal.Decimal(self.total_N)
    min_breakpoint = None
    min_metric_details = {}
    min_metric_details['sum'] = self.total_sum
    min_metric_details['N_zero'] = self.total_N
    min_distance_right = 0 # because the initial distance of the minimum actually is 0! (until we find a new minima to the RIGHT, or we don't in which case it doesn't matter)
    # print("pre", self.precomputed['data'][39967768]['sum_horiz'], self.precomputed['data'][39967768]['sum_vert'])
    # Go RIGHT!
    flat.print_log_msg('Searching right...')
    if breakpoint_index_in_locus_list+1 < len(self.precomputed['locus_list']):
        curr_loc_ind = breakpoint_index_in_locus_list+1
        curr_loc = self.precomputed['locus_list'][curr_loc_ind]
        # counter = 0
        # print("self.snp_last", self.snp_last)
        while curr_loc <= self.snp_last:
            # print("curr_loc", curr_loc)
            # print(curr_loc, "curr_sum", curr_sum, self.precomputed['data'][curr_loc]['sum_horiz'], self.precomputed['data'][curr_loc]['sum_vert'])
            # Moving right: subtract the locus' horizontal sum, add its vertical sum.
            curr_sum = curr_sum - self.precomputed['data'][curr_loc]['sum_horiz'] + self.precomputed['data'][curr_loc]['sum_vert']
            # counter += 1
            # print("_N curr_loc_ind", curr_loc_ind, snp_top_ind)
            horiz_N = curr_loc_ind-snp_bottom_ind-1
            vert_N = snp_top_ind-curr_loc_ind
            curr_N = curr_N - horiz_N + vert_N
            # print("horiz_N", horiz_N)
            # print("vert_N", vert_N)
            # print("curr_N", curr_N)
            curr_metric = decimal.Decimal(curr_sum) / decimal.Decimal(curr_N)
            # print("curr_loc", curr_loc, "curr_metric", curr_metric)
            if curr_metric < min_metric:
                min_metric = curr_metric
                min_breakpoint = curr_loc
                min_metric_details['sum'] = curr_sum
                min_metric_details['N_zero'] = curr_N
                min_distance_right = curr_loc - init_breakpoint_locus
                # print("min_metric", min_metric, min_breakpoint)
            # print("min_metric", min_metric, min_breakpoint, min_distance_right)
            if curr_loc_ind+1 < len(self.precomputed['locus_list']):
                curr_loc_ind += 1
                curr_loc = self.precomputed['locus_list'][curr_loc_ind]
            else:
                flat.print_log_msg('curr_locus_index out of bounds') # The possibility of this happening is only at the end of the chromosome (end of last partition)
                break
    else:
        flat.print_log_msg('Warning: breakpoint_index_in_locus_list+1 < len(self.precomputed["locus_list"]) not satisfied!')
        flat.print_log_msg('Breakpoints: '+repr(self.breakpoints))
        flat.print_log_msg('Locus_list: '+repr(self.precomputed['locus_list']))
        flat.print_log_msg('breakpoint_index_in_locus_list: '+ repr(breakpoint_index_in_locus_list))
    print("min_metric", min_metric, min_breakpoint, min_distance_right)
    # print("counter", counter)
    # Reset search for left
    curr_sum = self.total_sum
    curr_N = self.total_N
    # Go LEFT!
    flat.print_log_msg('Searching left...')
    if breakpoint_index_in_locus_list-1 >= 0:
        curr_loc_ind = breakpoint_index_in_locus_list-1
        curr_loc = self.precomputed['locus_list'][curr_loc_ind]
        curr_sum = self.total_sum
        curr_N = self.total_N
        while curr_loc > self.snp_first: # Don't include previous breakpoint!
            # Moving left: the signs are mirrored relative to the right scan.
            curr_sum = curr_sum + self.precomputed['data'][curr_loc]['sum_horiz'] - self.precomputed['data'][curr_loc]['sum_vert']
            horiz_N = curr_loc_ind-snp_bottom_ind-1
            vert_N = snp_top_ind-curr_loc_ind
            curr_N = curr_N + horiz_N - vert_N
            curr_metric = decimal.Decimal(curr_sum) / decimal.Decimal(curr_N)
            # On an exact tie the left candidate wins only if it is closer to
            # the initial breakpoint than the best right-hand candidate.
            if (curr_metric < min_metric) or (curr_metric == min_metric and (init_breakpoint_locus - curr_loc)<min_distance_right): # min_distance_right is used to compare to RIGHT metric, not within LEFT metric!
                min_metric = curr_metric
                min_breakpoint = curr_loc
                min_metric_details['sum'] = curr_sum
                min_metric_details['N_zero'] = curr_N
            if curr_loc_ind-1 >= 0:
                curr_loc_ind -= 1
                curr_loc = self.precomputed['locus_list'][curr_loc_ind]
            else:
                flat.print_log_msg('curr_locus_index out of bounds') # The possibility of this happening is only at the beginning of the chromosome (start of first partition)
                break
    else:
        flat.print_log_msg('Warning: breakpoint_index_in_locus_list-1 >=0 not satisfied!')
        flat.print_log_msg('Breakpoints: '+repr(self.breakpoints))
        flat.print_log_msg('Locus_list: '+repr(self.precomputed['locus_list']))
        flat.print_log_msg('breakpoint_index_in_locus_list: '+ repr(breakpoint_index_in_locus_list))
    self.search_complete = True
    flat.print_log_msg('Search done')
    return min_breakpoint, min_metric_details
def __init__(self, name, start_search, stop_search, initial_breakpoint_index, breakpoints, total_sum, total_N, input_config):
    """Validate the search window and set up state for a local search.

    Args:
        name: Identifier passed on to ``flat.get_final_partitions``.
        start_search: Lower end of the search window; becomes ``snp_first``.
        stop_search: Upper end of the search window; becomes ``snp_last``.
        initial_breakpoint_index: Index into ``breakpoints`` of the
            breakpoint being optimised.
        breakpoints: Sequence of breakpoint loci.
        total_sum: Initial sum of the metric.
        total_N: Initial element count of the metric.
        input_config: Configuration passed on to ``flat.get_final_partitions``.

    Raises:
        Exception: If the window is empty or reversed, the index is out of
            range, the breakpoint is not strictly inside the window, the
            window leaves the partitions' range, or the window reaches past a
            neighbouring breakpoint.

    Side effects:
        Sets the precision of the current ``decimal`` context to 50.
    """
    decimal.getcontext().prec = 50

    print(" *** start_search", start_search)
    print(" *** stop_search", stop_search)

    # Inputs
    self.name = name
    self.start_search = start_search
    self.stop_search = stop_search
    self.initial_breakpoint_index = initial_breakpoint_index
    self.breakpoints = breakpoints
    self.total_sum = total_sum
    self.total_N = total_N
    self.input_config = input_config

    # Working state
    self.matrix = {}
    self.locus_list = []
    self.locus_list_deleted = []
    # 'locus_list' keeps the ordering of loci for efficient iteration;
    # 'data' gives ~O(1) access to each element by its locus.
    self.precomputed = {'locus_list': [], 'data': {}}
    self.dynamic_delete = True
    self.init_complete = False
    self.search_complete = False

    # --- Validation of the window against the chosen breakpoint ---
    if start_search >= stop_search:
        raise Exception('Error: start_search >= stop_search')

    bp_idx = initial_breakpoint_index
    if bp_idx < 0 or bp_idx >= len(breakpoints):
        raise Exception('Error: initial_breakpoint_index index out of bounds')

    init_breakpoint = breakpoints[bp_idx]
    if init_breakpoint >= stop_search:
        raise Exception('Error: breakpoint >= stop_search')
    if init_breakpoint <= start_search:
        raise Exception('Error: breakpoint <= start_search')

    # --- Validation of the window against the available partitions ---
    window_partitions = flat.get_final_partitions(self.input_config, self.name, start_search, stop_search)
    range_lo = window_partitions[0][0]
    range_hi = window_partitions[len(window_partitions) - 1][1]

    if start_search < range_lo or start_search > range_hi:
        raise Exception('Error: start_search is out of bounds')
    if stop_search < range_lo or stop_search > range_hi:
        raise Exception('Error: stop_search is out of bounds')

    # --- Validation of the window against the neighbouring breakpoints ---
    # With no neighbour on a side, the partition-range checks above already
    # cover that side.
    has_lower_neighbor = bp_idx > 0
    has_upper_neighbor = bp_idx < (len(breakpoints) - 1)

    if has_lower_neighbor and start_search < breakpoints[bp_idx - 1]:
        raise Exception('Error: start_search cannot be further than a neighboring breakpoint')
    if has_upper_neighbor and stop_search > breakpoints[bp_idx + 1]:
        raise Exception('Error: stop_search cannot be further than a neighboring breakpoint')

    # snp_first / snp_last delimit the search itself; they follow the
    # requested window rather than the neighbouring breakpoints.
    self.snp_first = start_search
    flat.print_log_msg('snp_first: '+repr(self.snp_first))
    self.snp_last = stop_search
    flat.print_log_msg('snp_last: '+repr(self.snp_last))

    # Upper border of the search space: next breakpoint, or the end of the
    # last partition when this is the last breakpoint.
    self.snp_top = breakpoints[bp_idx + 1] if has_upper_neighbor else range_hi
    flat.print_log_msg('snp_top: '+repr(self.snp_top))

    # Lower border of the search space: previous breakpoint, or the start of
    # the first partition when this is the first breakpoint.
    self.snp_bottom = breakpoints[bp_idx - 1] if has_lower_neighbor else range_lo
    flat.print_log_msg('snp_bottom: '+repr(self.snp_bottom))

    # Data must be read until snp_top!
    self.partitions = flat.get_final_partitions(self.input_config, self.name, self.snp_bottom, self.snp_top)

    self.start_locus = -1
    self.start_locus_index = -1
    self.end_locus = -1
    self.end_locus_index = -1
def init_search_lean(self):
    """Read the partitions and precompute per-locus values for the search.

    Partitions between ``self.snp_bottom`` and ``self.snp_top`` are loaded
    with ``flat.read_partition_into_matrix_lean``. Every locus from the first
    one >= ``snp_bottom`` up to each partition's ``end_locus`` is registered
    through ``self.add_locus_to_precomputed``. For loci inside the search
    window (``> snp_first``, or any locus when the first breakpoint is being
    optimised, and ``<= snp_last``) the squared correlation coefficient with
    every stored partner ``<= snp_top`` is passed to
    ``self.add_val_to_precomputed``; all other loci receive a zero dummy
    value. Processed loci are then removed with
    ``flat.delete_loci_smaller_than_leanest``.

    Raises:
        Exception: If ``self.dynamic_delete`` is false, or if
            ``self.locus_list`` is empty after a partition was read.

    Side effects:
        Sets ``self.start_locus``, ``self.start_locus_index``,
        ``self.end_locus``, ``self.end_locus_index`` and
        ``self.init_complete``; writes diagnostics via ``print`` and
        ``flat.print_log_msg``.

    NOTE(review): ``start_locus``/``end_locus`` are only bound once a locus
    has been found; if every partition is skipped the final assignments raise
    ``UnboundLocalError``.
    """
    if not self.dynamic_delete:
        raise Exception('Error: dynamic_delete should be True for local search!')

    flat.print_log_msg('Start local search init')
    print("self.partitions", self.partitions)

    # Pre-read (without processing) every partition that lies completely
    # below snp_bottom, i.e. while snp_bottom is at or beyond the start of the
    # following partition.
    last_p_num = -1
    for p_num_init in range(0, len(self.partitions)-1):
        print("p_num_init", p_num_init)
        if self.snp_bottom >= self.partitions[p_num_init+1][0]:
            flat.print_log_msg('Pre-reading partition: '+str(self.partitions[p_num_init]))
            flat.read_partition_into_matrix_lean(self.partitions, p_num_init, self.matrix, self.locus_list, self.name, self.input_config, self.snp_bottom, self.snp_top)
            last_p_num = p_num_init
        else:
            break

    print("===", self.snp_bottom, self.snp_top, self.snp_first, self.snp_last)

    curr_locus = -1
    for p_num in range(last_p_num+1, len(self.partitions)):
        print("p_num", p_num)
        p = self.partitions[p_num]
        flat.print_log_msg('Reading partition: '+str(p))
        # Data must be read until snp_top!
        flat.read_partition_into_matrix_lean(self.partitions, p_num, self.matrix, self.locus_list, self.name, self.input_config, self.snp_bottom, self.snp_top)

        # Determine first locus
        if curr_locus < 0:
            # Either first partition or not found in first partition
            if len(self.locus_list) > 0:
                # Find first locus >= snp_bottom
                for i, locus in enumerate(self.locus_list):
                    if locus >= self.snp_bottom:
                        curr_locus = locus
                        start_locus = locus
                        curr_locus_index = i
                        start_locus_index = i
                        break
            else:
                raise Exception('Error: locus_list seems to be empty')
        else:
            # curr_locus is carried from the previous iteration, but its index
            # has changed since part of the matrix (and locus_list) was deleted.
            try:
                curr_locus_index = self.locus_list.index(curr_locus)
            except ValueError:
                if len(self.locus_list) > 0:
                    curr_locus = self.locus_list[0]
                    curr_locus_index = 0
                else:
                    raise Exception('Error: locus_list seems to be empty')

        if curr_locus < 0:
            flat.print_log_msg('Warning: curr_locus not found! Continuing to next partition.')
            flat.print_log_msg('Comment: This is possibly due to snp_bottom being very close to end of partition.')
            flat.print_log_msg('Details: ')
            flat.print_log_msg('Partition: '+repr(p))
            flat.print_log_msg('snp_bottom: '+repr(self.snp_bottom))
            flat.print_log_msg('curr_locus: '+repr(curr_locus))
            continue  # continue to next partition

        print("self.snp_last", self.snp_last)

        # Determine end locus
        if p_num+1 < len(self.partitions):
            end_locus = self.partitions[p_num+1][0]
            end_locus_index = -1
        else:
            # Last partition: find last locus <= snp_last
            end_locus_found = False
            for i in reversed(range(0, len(self.locus_list))):
                if self.locus_list[i] <= self.snp_last:
                    end_locus = self.locus_list[i]
                    end_locus_index = i
                    end_locus_found = True
                    break
            if not end_locus_found:
                end_locus_index = 0
                end_locus = self.locus_list[end_locus_index]

        flat.print_log_msg('Running precompute for partition: '+str(p))
        flat.print_log_msg('start_locus: '+repr(start_locus)+' end_locus: '+repr(end_locus)+' end_locus_index '+repr(end_locus_index))

        # This will not include the very last SNP of the complete range, but
        # that shouldn't be too important since the end of the range shouldn't
        # be a defining location for LD
        print("checking that curr_locus is smaller than", curr_locus, end_locus, "oooo")
        while curr_locus <= end_locus:
            # We want snp_bottom to be added here always (for later use).
            # Same thing for snp_top
            self.add_locus_to_precomputed(curr_locus)

            # Do not include snp_first in the calculation unless the very
            # first block is being taken into account. Do not calculate
            # anything above snp_last, just insert dummies.
            if (curr_locus > self.snp_first or self.initial_breakpoint_index == 0) and (curr_locus <= self.snp_last):
                for key, value in self.matrix[curr_locus].items():
                    # don't take into account anything over snp_top
                    if key <= self.snp_top:
                        corr_coeff = value / math.sqrt(self.matrix[curr_locus][curr_locus] * self.matrix[key][key])
                        # Including the diagonal does not matter: it is later
                        # added and subtracted exactly once.
                        self.add_val_to_precomputed(decimal.Decimal(corr_coeff**2), curr_locus, key)
            else:
                # Dummy value, in order to be consistent for some other future
                # use of these data structures
                self.add_val_to_precomputed(decimal.Decimal(0), curr_locus, curr_locus)

            if curr_locus_index+1 < len(self.locus_list):
                curr_locus_index += 1
                curr_locus = self.locus_list[curr_locus_index]
            else:
                # The possibility of this happening is only at the end of the
                # range [usually chromosome] (end of last partition).
                # Log the plain message only: 'sum_horiz' is used as a number
                # elsewhere in this file, so taking len() of it here raised
                # TypeError instead of ending the loop.
                flat.print_log_msg('curr_locus_index out of bounds')
                break

        print("len(self.locus_list)", len(self.locus_list))

        if self.dynamic_delete:
            flat.print_log_msg('Deleting loci not required any more')
            flat.delete_loci_smaller_than_leanest(end_locus, self.matrix, self.locus_list)

    self.start_locus = start_locus
    self.start_locus_index = start_locus_index
    self.end_locus = end_locus
    self.end_locus_index = end_locus_index
    self.init_complete = True