def expmat_construction(exp_file, exp_paramlist, charge_list):
    """Build a sparse (m/z x time) experimental intensity matrix from a
    pickled mass-spectrogram DataFrame.

    Parameters
    ----------
    exp_file : str
        Path to a pickled pandas DataFrame with (string-typed) columns
        'ind', 'mslev', 'bpmz', 'bpint', 'starttime' plus list columns
        'mzarray' and 'intarray'.
    exp_paramlist : sequence
        Parameter tuples; entry [1] supplies
        (mm, mrange_min, mrange_max, mz_range, MZ_SCALE,
         tt, gradient_starttime, gradient_endtime, gradient_time,
         TIME_SCALE, window, shift).
    charge_list
        Unused here; kept for interface compatibility.

    Returns
    -------
    (exp_mat, exp_paramlist, mat_mean)
        exp_mat  : smoothed, rescaled sparse matrix, shape (MZ_SCALE, TIME_SCALE)
        mat_mean : mean value returned by rescale_mat.
    """
    # The original iterated over exp_paramlist but re-unpacked the same
    # entry [1] on every pass; a single unpack is equivalent.
    mm, mrange_min, mrange_max, mz_range, MZ_SCALE, \
        tt, gradient_starttime, gradient_endtime, gradient_time, TIME_SCALE, \
        window, shift = exp_paramlist[1]

    # Single pre-formatted string keeps the original message (including its
    # double space) and prints identically under Python 2 and 3.
    print(' from: %s  using mass spectrogram from %s to %s minutes'
          % (exp_file, gradient_starttime, gradient_endtime))

    exp_df = pd.read_pickle(exp_file)

    # Coerce the string-typed header columns to numeric dtypes.
    for col in ('ind', 'mslev', 'bpmz', 'bpint', 'starttime'):
        exp_df[col] = pd.to_numeric(exp_df[col])

    # Keep only scans inside the gradient time window.
    exp_df = exp_df[exp_df['starttime'] >= gradient_starttime]
    exp_df = exp_df[exp_df['starttime'] < gradient_endtime]

    # Prepend each base-peak value to its array: allmz = [bpmz] + mzarray, etc.
    for bp, ar, combine in zip(['bpmz', 'bpint'], ['mzarray', 'intarray'],
                               ['allmz', 'allint']):
        exp_df[combine] = exp_df[bp].apply(lambda x: [x]) + exp_df[ar]

    # Convert retention time and m/z values to integer grid indices.
    exp_df['starttime'] = time_index(exp_df['starttime'], gradient_starttime, tt)
    exp_df['allmz'] = mz_index(exp_df['allmz'].values, mrange_min, mrange_max, mm)
    exp_df = exp_df[['ind', 'starttime', 'allmz', 'allint']]

    # One dense intensity column of length MZ_SCALE per scan.  The original
    # staged rows in a temp list flushed every 500 iterations; appending
    # directly produces the identical final list.
    time_col = []
    for _, row in exp_df.iterrows():
        # Drop points whose m/z index fell outside [0, MZ_SCALE).
        allint = [i for m, i in zip(row['allmz'], row['allint'])
                  if m >= 0 and m < MZ_SCALE]
        allmz = [m for m in row['allmz'] if m >= 0 and m < MZ_SCALE]
        # bincount sums intensities that land in the same m/z bin.
        timecol_array = np.bincount(allmz, allint, minlength=MZ_SCALE)
        # Suppress sub-unit (noise-level) bin totals.
        timecol_array[timecol_array < 1] = 0
        time_col.append(timecol_array)
    exp_df['allint_overlap'] = time_col

    # Flatten to COO triples: every scan contributes a full MZ_SCALE column.
    expdf_row = np.tile(np.arange(MZ_SCALE), exp_df.shape[0])
    expdf_col = np.repeat(exp_df['starttime'].values, MZ_SCALE)
    expdf_value = np.concatenate(exp_df['allint_overlap'].values)
    exp_mat = sparse.coo_matrix((expdf_value, (expdf_row, expdf_col)),
                                shape=(MZ_SCALE, TIME_SCALE))
    exp_mat = smoothingtime_mat(exp_mat, window, shift)
    exp_mat, mat_mean = rescale_mat(exp_mat)
    return exp_mat, exp_paramlist, mat_mean
def h_prediction(initRT_tuple, initRT_width, Hpeak_mean, noise_number,
                 globalparam_list, gaussian_width):
    """Build a sparse H matrix with one Gaussian elution profile per entry
    of initRT_tuple, plus the input tuples extended with clamped RTs.

    Parameters
    ----------
    initRT_tuple : sequence of 3-tuples
        Third field is the retention time in minutes; presumably the first
        two are identifiers — confirm against the caller.
    initRT_width, noise_number
        Unused in this function; kept for interface compatibility.
    Hpeak_mean : float
        Scale factor applied to each Gaussian profile.
    globalparam_list : sequence
        Entry [1] supplies the same 12-field parameter tuple used elsewhere
        in this module.
    gaussian_width : float
        Peak width in minutes; converted to time-index units below.

    Returns
    -------
    (H_mat, initRT_correct_keep)
        H_mat : smoothed sparse matrix, shape (len(initRT_tuple), TIME_SCALE)
        initRT_correct_keep : list of 4-tuples (a, b, rt, rt_clamped).
    """
    # NOTE(review): gc is disabled here and never re-enabled in this
    # function, so callers inherit a disabled collector — confirm intended.
    gc.disable()

    # The original iterated over globalparam_list but re-unpacked the same
    # entry [1] on every pass; a single unpack is equivalent.
    mm, mrange_min, mrange_max, mz_range, MZ_SCALE, \
        tt, gradient_starttime, gradient_endtime, gradient_time, TIME_SCALE, \
        window, shift = globalparam_list[1]

    # Peak geometry in time-index units (before smoothing).
    peakside_index = int(gaussian_width / 2 * tt)
    peakwidth_index = int((peakside_index * 2) + 1)
    # NOTE(review): floor division under Python 2, true division under
    # Python 3; norm.pdf accepts either — confirm the intended interpreter.
    peakSD_index = peakwidth_index / 4

    rt_index = time_index([x[2] for x in initRT_tuple], gradient_starttime, tt)

    keeprow, keepcol, keepdata = [], [], []
    for idx, rt in enumerate(rt_index):
        # Clamp the peak apex onto the valid time grid.
        if rt < 0:
            rt = 0
        elif rt > TIME_SCALE - 1:
            rt = TIME_SCALE - 1
        left, right = rt - peakside_index, rt + peakside_index
        # peakwidth_index samples across [left, right] => step is exactly 1.
        peak_at = np.linspace(left, right, peakwidth_index)
        peak_int = norm.pdf(peak_at, rt, peakSD_index) * Hpeak_mean
        # Drop samples that fall outside the time axis.
        inrange = (peak_at >= 0) & (peak_at < TIME_SCALE)
        peak_at = peak_at[inrange]
        peak_int = peak_int[inrange]
        # One row index per surviving sample.  Since the linspace step is 1,
        # len(peak_at) equals the original arange(first, last + 1) count
        # without the float-fencepost risk (and the unused per-iteration
        # np.zeros(TIME_SCALE) allocation is gone).
        keeprow.append([idx] * len(peak_at))
        keepcol.append(peak_at)
        keepdata.append(peak_int)

    keeprow, keepcol, keepdata = (flatten(keeprow), flatten(keepcol),
                                  flatten(keepdata))
    H_mat = sparse.coo_matrix((keepdata, (keeprow, keepcol)),
                              shape=(len(initRT_tuple), TIME_SCALE))
    H_mat = smoothingtime_mat(H_mat, window, shift)

    # Clamp each RT into [gradient_starttime, gradient_endtime] and append
    # it to the original tuple (replaces the two-pass list clamping).
    initRT_correct_keep = [
        (a, b, c, min(max(c, gradient_starttime), gradient_endtime))
        for a, b, c in initRT_tuple
    ]
    return H_mat, initRT_correct_keep