# Method fragment; assumes `import numpy as np` and
# `from scipy.stats import gmean, gstd`.
def log_stats(self, vals):
    # Geometric mean with an approximate 95% confidence interval.
    # eps guards against log(0) when a value is exactly zero.
    eps = 1e-16
    nreps = len(vals)
    vals = np.asarray(vals) + eps
    g_mean = gmean(vals) - eps
    g_std = gstd(vals)
    return (g_mean,
            g_mean * g_std ** (-1.96 / np.sqrt(nreps)),
            g_mean * g_std ** (1.96 / np.sqrt(nreps)))

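# Usage sketch (hypothetical values) showing how the interval tightens with
# more replicates: the exponent is scaled by 1.96 / sqrt(nreps).
import numpy as np
from scipy.stats import gmean, gstd

vals = np.array([1.2, 0.8, 1.5, 1.1, 0.9])
nreps = len(vals)
g_mean, g_std = gmean(vals), gstd(vals)
lo = g_mean * g_std ** (-1.96 / np.sqrt(nreps))
hi = g_mean * g_std ** (1.96 / np.sqrt(nreps))
print(f"geometric mean {g_mean:.3f}, 95% CI [{lo:.3f}, {hi:.3f}]")
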
from scipy.stats import sem


def cochrans(input_list):
    # Cochran's sample-size formula at 95% confidence (z = 1.96) with
    # maximum variability, p = 0.5, so p * (1 - p) = 0.25.
    error = sem(input_list)  # standard error of the mean as the margin of error
    z = 1.96
    n = ((z ** 2) * 0.25) / (error ** 2)
    return int(n)

from scipy import stats
from scipy.stats import sem


def sample_size(input_list):
    # Sample size from the margin-of-error relation n = (z * s / e)^2,
    # using the geometric standard deviation as the spread estimate.
    error = sem(input_list)
    std = stats.gstd(input_list)
    z = 1.96  # 95% confidence
    n = ((z * std) / error) ** 2
    return int(n)

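# Hypothetical usage comparing the two estimators on the same made-up data;
# `measurements` exists only for illustration.
measurements = [10.1, 9.8, 10.4, 10.0, 9.9, 10.2]
print(cochrans(measurements))     # Cochran's n with p = 0.5
print(sample_size(measurements))  # margin-of-error n using the geometric std
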
import numpy as np
from scipy import stats


def trace_length(log):
    # Descriptive statistics of trace lengths in an event log.
    trace_lengths = []
    n_events = 0
    for trace in log:
        n_events += len(trace)
        trace_lengths.append(len(trace))
    trace_len_min = np.min(trace_lengths)
    trace_len_max = np.max(trace_lengths)
    trace_len_mean = np.mean(trace_lengths)
    trace_len_median = np.median(trace_lengths)
    # scipy >= 1.9: keepdims=False returns scalars; the old [0][0] indexing
    # breaks on current scipy
    trace_len_mode = stats.mode(trace_lengths, keepdims=False).mode
    trace_len_std = np.std(trace_lengths)
    trace_len_variance = np.var(trace_lengths)
    trace_len_q1 = np.percentile(trace_lengths, 25)
    trace_len_q3 = np.percentile(trace_lengths, 75)
    trace_len_iqr = stats.iqr(trace_lengths)
    trace_len_geometric_mean = stats.gmean(trace_lengths)
    trace_len_geometric_std = stats.gstd(trace_lengths)
    trace_len_harmonic_mean = stats.hmean(trace_lengths)
    trace_len_skewness = stats.skew(trace_lengths)
    trace_len_kurtosis = stats.kurtosis(trace_lengths)
    trace_len_coefficient_variation = stats.variation(trace_lengths)
    trace_len_entropy = stats.entropy(trace_lengths)
    trace_len_hist, _ = np.histogram(trace_lengths, density=True)
    trace_len_skewness_hist = stats.skew(trace_len_hist)
    trace_len_kurtosis_hist = stats.kurtosis(trace_len_hist)
    return [
        n_events,
        trace_len_min,
        trace_len_max,
        trace_len_mean,
        trace_len_median,
        trace_len_mode,
        trace_len_std,
        trace_len_variance,
        trace_len_q1,
        trace_len_q3,
        trace_len_iqr,
        trace_len_geometric_mean,
        trace_len_geometric_std,
        trace_len_harmonic_mean,
        trace_len_skewness,
        trace_len_kurtosis,
        trace_len_coefficient_variation,
        trace_len_entropy,
        *trace_len_hist,
        trace_len_skewness_hist,
        trace_len_kurtosis_hist,
    ]

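# Usage sketch on a toy event log (a list of traces, each a list of events);
# a real log would come from a process-mining library, but any iterable of
# sized traces works.
toy_log = [['a', 'b', 'c'], ['a', 'b'], ['a', 'b', 'c', 'd']]
features = trace_length(toy_log)
print(len(features), features[:6])  # 30 features; the histogram adds 10 bins
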
# Method fragment; assumes numpy as np and scipy.stats as st.
def comparacion_utilidadades(self):
    print('COMPUTING BEST AND WORST ALPHA...')
    self.senal_actualizar_avance.emit(
        ('COMPUTING BEST AND WORST ALPHA...', 0))
    self.calcular_alfas()
    print('COMPUTING CONFIDENCE INTERVALS...')
    self.senal_actualizar_avance.emit(
        ('COMPUTING CONFIDENCE INTERVALS...', self.iteraciones_busqueda_alfas))
    iteracion = 0
    valores_buenos = []
    valores_malos = []
    while iteracion < self.iteraciones_validacion_alfas:
        bueno = self.calcular_utilidad(self.alfa_bueno)
        malo = self.calcular_utilidad(self.alfa_malo)
        valores_buenos.append(bueno)
        valores_malos.append(malo)
        iteracion += 1
        print(iteracion)
        self.senal_actualizar_avance.emit(
            ('COMPUTING CONFIDENCE INTERVALS...',
             iteracion + self.iteraciones_busqueda_alfas))
    # Note: st.gstd is a multiplicative (geometric) spread factor; using it
    # as the additive `scale` of a normal interval is questionable -- the
    # sample standard deviation is probably what was intended here.
    self.intervalo_bueno = st.norm.interval(self.confianza_intervalos,
                                            loc=np.mean(valores_buenos),
                                            scale=st.gstd(valores_buenos))
    self.intervalo_malo = st.norm.interval(self.confianza_intervalos,
                                           loc=np.mean(valores_malos),
                                           scale=st.gstd(valores_malos))
    self.senal_actualizar_avance.emit(
        ('COMPUTING CONFIDENCE INTERVALS...',
         iteracion + self.iteraciones_busqueda_alfas))
    self.senal_terminar.emit({
        'utilidad_buena': self.intervalo_bueno,
        'utilidad_mala': self.intervalo_malo,
        'todas': self.intervalo_todas_las_ut,
    })

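# Minimal sketch of scipy's normal interval on made-up utilities. For a
# confidence interval on the mean, the usual scale is the standard error
# (sample std / sqrt(n)), not a geometric std.
import numpy as np
import scipy.stats as st

utilidades = [10.2, 11.0, 9.8, 10.5, 10.1]
ci = st.norm.interval(0.95,
                      loc=np.mean(utilidades),
                      scale=np.std(utilidades, ddof=1) / np.sqrt(len(utilidades)))
print(ci)
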
# Method fragment; assumes `from scipy.stats import gmean, gstd`.
def get_cgm_stats(self, start_date, end_date):
    """
    Compute CGM stats between two dates.

    Args:
        start_date (dt.datetime): start date
        end_date (dt.datetime): end date

    Returns:
        (float, float): geometric mean and geometric std
    """
    cgm_values = []
    for time, cgm_event in self.glucose_timeline.items():
        if start_date <= time <= end_date:
            cgm_values.append(cgm_event.get_value())
    return gmean(cgm_values), gstd(cgm_values)

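# Standalone sketch of the same filtering on a plain dict of
# datetime -> glucose reading; names and numbers are illustrative only.
import datetime as dt
from scipy.stats import gmean, gstd

timeline = {
    dt.datetime(2023, 1, 1, 8): 110.0,
    dt.datetime(2023, 1, 1, 12): 145.0,
    dt.datetime(2023, 1, 2, 8): 98.0,
}
start, end = dt.datetime(2023, 1, 1), dt.datetime(2023, 1, 1, 23)
values = [v for t, v in timeline.items() if start <= t <= end]
print(gmean(values), gstd(values))
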
import numpy as np
import scipy.stats as sci


def combine_triplicates(plate_df_in, checks_include, master, use_master_curve):
    '''
    Flag outliers via Grubbs test and calculate the Cq means, Cq stds, and
    counts before and after removing outliers.

    Params
        plate_df_in: qPCR data in a pandas df; must be 1 plate with 1 target,
            in the QuantStudio3 format with columns 'Target', 'Sample', 'Cq'
        checks_include: how to check for outliers; options are
            ('grubbs_only', None)

    Returns
        plate_df: same data, with additional columns depending on checks_include
        plate_df_avg: per-sample summary with Cq and Quantity means, stds
            (arithmetic and geometric), and replicate counts before and after
            outlier removal

    Note: Cq_raw preserves the raw values, Cq_fin is after substitution and
    outlier removal, and Cq_subbed is after substitution only (so that it
    goes through the Grubbs test).
    '''
    if checks_include not in ['grubbs_only', None]:
        raise ValueError('''invalid input, must be 'grubbs_only' or None''')
    if len(plate_df_in.Target.unique()) > 1:
        raise ValueError('''More than one target in this dataframe''')
    target = plate_df_in.Target.unique()
    plate_df = plate_df_in.copy()  # avoids the pandas SettingWithCopy warning
    groupby_list = [
        'plate_id', 'Sample', 'sample_full', 'Sample_plate', 'Target', 'Task',
        'inhibition_testing', 'is_dilution', 'dilution'
    ]

    # make a copy of the Cq column; outliers are later set to np.nan
    plate_df['Cq_raw'] = plate_df['Cq'].copy()

    plate_df['master_curve_bloq_qpcr_reps'] = False
    if use_master_curve and (target[0] != 'Xeno'):
        plate_df.loc[(np.isnan(plate_df.Cq)) | (plate_df.Cq > 40),
                     'master_curve_bloq_qpcr_reps'] = True
        plate_df.loc[(np.isnan(plate_df.Cq)) | (plate_df.Cq > 40),
                     'Cq'] = master.loc[master.Target == target[0],
                                        'LoD_Cq'].item()

    plate_df['Cq_subbed'] = plate_df['Cq'].copy()
    plate_df['Cq_fin'] = plate_df['Cq'].copy()

    # Grubbs test (external helper get_pass_grubbs_test); 'all' is kept here
    # even though the validation above only accepts 'grubbs_only' or None
    if checks_include in ['all', 'grubbs_only']:
        plate_df = get_pass_grubbs_test(plate_df, ['Sample'])
        plate_df.loc[plate_df.grubbs_test == False, 'Cq_fin'] = np.nan

    # summarize to get mean, std, counts with and without outliers removed
    plate_df_avg = plate_df.groupby(groupby_list).agg(
        raw_Cq_values=('Cq_raw', list),
        sub_Cq_values=('Cq_subbed', list),
        outlier_Cq_values=('Cq_fin', list),
        template_volume=('template_volume', 'max'),
        # Q_init_* preserve quantity information for standards later
        Q_init_mean=('Quantity', 'mean'),
        Q_init_std=('Quantity', 'std'),
        # gstd needs at least two non-NaN values; otherwise return NaN
        Q_init_gstd=('Quantity',
                     lambda x: np.nan
                     if ((len(x.dropna()) < 2) | all(np.isnan(x)))
                     else sci.gstd(x.dropna(), axis=0)),
        Cq_init_mean=('Cq_raw', 'mean'),
        Cq_init_std=('Cq_raw', 'std'),
        Cq_init_min=('Cq_raw', 'min'),
        replicate_init_count=('Cq', 'count'),
        Cq_mean=('Cq_fin', 'mean'),
        Cq_std=('Cq_fin', 'std'),
        replicate_count=('Cq_fin', 'count'),
        is_undetermined_count=('is_undetermined', 'sum'),
        is_bloq_count=('master_curve_bloq_qpcr_reps', 'sum'))
    # note: 'count' in agg excludes NaN
    plate_df_avg = plate_df_avg.reset_index()
    return (plate_df, plate_df_avg)

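# Toy sketch of the guarded gstd aggregation used above: scipy's gstd needs
# at least two positive, non-NaN values, so the lambda falls back to NaN.
import numpy as np
import pandas as pd
import scipy.stats as sci

df = pd.DataFrame({'Sample': ['A', 'A', 'A', 'B', 'B'],
                   'Quantity': [100.0, 120.0, 90.0, 50.0, np.nan]})
out = df.groupby('Sample').agg(
    Q_gstd=('Quantity', lambda x: np.nan if len(x.dropna()) < 2
            else sci.gstd(x.dropna())))
print(out)  # A gets a gstd; B has a single non-NaN value and gets NaN
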
import numpy as np
import scipy.stats as sci


def process_unknown(plate_df, std_curve_info, use_master_curve, master):
    '''
    Calculates quantity based on Cq_mean and the standard curve.

    Params
        plate_df: output from combine_triplicates(); df containing Cq_mean;
            must be a single plate with a single target
        std_curve_info: output from process_standard() as a list

    Returns
        unknown_df: the unknown subset of plate_df, with new columns
            Quantity_mean: quantity recalculated from Cq_mean and the slope
                and intercept of the standard curve
            q_diff
            Cq_of_lowest_sample_quantity: the Cq value of the lowest point
                used on the plate
            percent_CV: the coefficient of variation for qPCR technical
                triplicates
            intraassay_var: intra-assay variation (arithmetic mean of the
                coefficient of variation for all triplicates on a plate)
    '''
    [num_points, Cq_of_lowest_std_quantity, lowest_std_quantity,
     Cq_of_lowest_std_quantity_gsd, slope, intercept, r2,
     efficiency] = std_curve_info
    unknown_df = plate_df[plate_df.Task != 'Standard'].copy()
    unknown_df['Cq_of_lowest_sample_quantity'] = np.nan

    # the geometric std minus 1 is the coefficient of variation; uses
    # QuantStudio quantities to capture all the variation on the plate
    unknown_df['percent_CV'] = (unknown_df['Q_init_gstd'] - 1) * 100
    if all(np.isnan(unknown_df['percent_CV'])):
        unknown_df['intraassay_var'] = np.nan  # avoid error
    else:
        unknown_df['intraassay_var'] = np.nanmean(unknown_df['percent_CV'])

    # set the Cq of the lowest sample quantity for different situations
    if len(unknown_df.Task) == 0:  # standard-curve-only plate
        unknown_df['Cq_of_lowest_sample_quantity'] = np.nan
    else:
        if all(np.isnan(unknown_df.Cq_mean)):
            # plate with all undetermined samples
            unknown_df['Cq_of_lowest_sample_quantity'] = np.nan  # avoid error
        else:
            targs = unknown_df.Target.unique()
            for target in targs:  # other plates (most cases)
                # per target because of Xeno
                unknown_df.loc[unknown_df.Target == target,
                               'Cq_of_lowest_sample_quantity'] = np.nanmax(
                                   unknown_df.loc[unknown_df.Target == target,
                                                  'Cq_mean'])

    unknown_df['Quantity_mean'] = np.nan
    unknown_df['q_diff'] = np.nan

    # bug fix: the original used `if ~use_master_curve:`, but `~` on a Python
    # bool yields -2 or -1 (both truthy), so the branch always ran
    if not use_master_curve:
        unknown_df['Quantity_mean'] = 10**(
            (unknown_df['Cq_mean'] - intercept) / slope)
        # initialize columns
        unknown_df['Quantity_std_combined_after'] = np.nan
        unknown_df['Quantity_mean_combined_after'] = np.nan
        for row in unknown_df.itertuples():
            ix = row.Index
            # drop initial NaNs
            filtered_1 = [e for e in row.raw_Cq_values if not np.isnan(e)]
            filtered = [10**((e - intercept) / slope) for e in filtered_1]
            if len(filtered) > 1:
                # NaNs introduced when slope and intercept are NaN
                filtered = [e for e in filtered if not np.isnan(e)]
                if len(filtered) > 1 and row.Target != 'Xeno':
                    unknown_df.loc[ix, 'Quantity_mean_combined_after'] = \
                        sci.gmean(filtered)
                    if all(x > 0 for x in filtered):
                        unknown_df.loc[ix, 'Quantity_std_combined_after'] = \
                            sci.gstd(filtered)

    if use_master_curve:
        targs = unknown_df.Target.unique()
        # initialize once; the original reset this column on every loop
        # iteration, which could wipe flags set for earlier targets
        unknown_df['blod_master_curve'] = False
        for targ in targs:
            if targ != 'Xeno':
                m_b = master.loc[master.Target == targ, 'b'].item()
                m_m = master.loc[master.Target == targ, 'm'].item()
                lowest = master.loc[master.Target == targ,
                                    'lowest_quantity'].item()
                lod = master.loc[master.Target == targ, 'LoD_quantity'].item()
                unknown_df.loc[unknown_df.Target == targ, 'Quantity_mean'] = \
                    10**((unknown_df.loc[unknown_df.Target == targ,
                                         'Cq_mean'] - m_b) / m_m)
                unknown_df.loc[unknown_df.Quantity_mean < lowest,
                               'blod_master_curve'] = True
                unknown_df.loc[unknown_df.Quantity_mean < lowest,
                               'Quantity_mean'] = lod

    # if Cq_mean is zero, don't calculate a quantity (turn to NaN)
    unknown_df.loc[unknown_df.Cq_mean == 0, 'Quantity_mean'] = np.nan
    unknown_df['q_diff'] = unknown_df['Q_init_mean'] - unknown_df['Quantity_mean']
    return unknown_df

# Method fragment; assumes numpy as np, scipy.stats as st, and
# `from random import seed`.
def calcular_alfas(self):
    '''
    alfa_0: initial alpha
    var: variation of each alpha per iteration
    iter_max: number of iterations

    RETURNS
    incumbent_ut: best utility
    incumbent_alfa: best alpha
    mala_ut: worst utility
    mal_alfa: worst alpha
    nova_ut: list of tuples (best utility so far, best alpha so far)
    malas_utilidades: list of tuples (worst utility so far, worst alpha so far)
    maximos_por_iter: list of the best utility per iteration
    '''
    incumbent_ut = self.calcular_utilidad(self.alfa_0)
    incumbent_alfa = self.alfa_0
    nova_ut = [(incumbent_ut, incumbent_alfa)]
    mala_ut = incumbent_ut
    mejor_alfa = incumbent_alfa
    mal_alfa = incumbent_alfa
    malas_utilidades = [(mala_ut, mal_alfa)]
    todas_las_utilidades = [incumbent_ut]
    iteracion = 0
    maximo_igual = 0
    minimo_igual = 0
    pond = 1
    while (iteracion < self.iteraciones_busqueda_alfas
           and maximo_igual < self.iteraciones_igual_res):
        seed()
        print(maximo_igual, self.iteraciones_igual_res, incumbent_ut)
        alfas = [self.nuevo_alpha(mejor_alfa, pond) for _ in range(5)]
        print(alfas)
        utilidades = [self.calcular_utilidad(alpha) for alpha in alfas]
        mejor_utilidad = max(utilidades)
        mejor_alfa = alfas[utilidades.index(mejor_utilidad)]
        self.maximos.append(mejor_utilidad)
        peor_utilidad = min(utilidades)
        peor_alfa = alfas[utilidades.index(peor_utilidad)]
        todas_las_utilidades += utilidades
        if peor_utilidad < mala_ut:
            mala_ut = peor_utilidad
            mal_alfa = peor_alfa
            malas_utilidades.append((mala_ut, mal_alfa))
            minimo_igual = 0
        if mejor_utilidad > incumbent_ut:
            incumbent_ut = mejor_utilidad
            incumbent_alfa = mejor_alfa
            nova_ut.append((incumbent_ut, incumbent_alfa))
            maximo_igual = 0
        maximo_igual += 1
        iteracion += 1  # bug fix: the loop counter was never advanced
        self.iteracion_busqueda += 1
        print(self.iteracion_busqueda)
        # Step-size ladder, reconstructed: the original chain repeated the
        # 0.25 threshold and mixed if/elif so later branches were
        # unreachable. The apparent intent is to escalate the perturbation
        # weight as the search stagnates.
        if maximo_igual < self.iteraciones_igual_res * 0.25:
            pond = 1
        elif maximo_igual < self.iteraciones_igual_res * 0.5:
            pond = 2
        elif maximo_igual < self.iteraciones_igual_res * 0.75:
            pond = 3
        else:
            pond = 4
        self.senal_actualizar_avance.emit(
            ('COMPUTING BEST AND WORST ALPHA...', iteracion))
    self.ut_buena = incumbent_ut
    self.alfa_bueno = incumbent_alfa
    self.ut_mala = mala_ut
    self.alfa_malo = mal_alfa
    self.mejores_utilidades = nova_ut
    self.peores_utilidades = malas_utilidades
    # see the note in comparacion_utilidadades: gstd as a normal `scale`
    # is questionable
    self.intervalo_todas_las_ut = st.norm.interval(
        self.confianza_intervalos,
        loc=np.mean(todas_las_utilidades),
        scale=st.gstd(todas_las_utilidades))

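# Standalone sketch of the reconstructed step-size ladder (thresholds are an
# assumption, per the comment in calcular_alfas): the perturbation weight
# grows as the search stagnates.
def step_weight(stale_iters, max_stale):
    if stale_iters < max_stale * 0.25:
        return 1
    elif stale_iters < max_stale * 0.5:
        return 2
    elif stale_iters < max_stale * 0.75:
        return 3
    return 4

assert [step_weight(i, 100) for i in (10, 30, 60, 90)] == [1, 2, 3, 4]
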
# Method fragment; assumes numpy as np and `from scipy.stats import gmean, gstd`.
# Unlike the log_stats variant above, the exponent is not scaled by
# 1 / sqrt(nreps), so g_std**±1.96 spans roughly the central 95% of
# individual values rather than a confidence interval for the mean.
def log_stats(self, vals):
    eps = 1e-16
    vals = np.asarray(vals) + eps
    g_mean = gmean(vals) - eps
    g_std = gstd(vals)
    return g_mean, g_mean * g_std ** -1.96, g_mean * g_std ** 1.96

from scipy import stats
from scipy.stats import sem


def sample_size(input_list):
    error = sem(input_list)
    std = stats.gstd(input_list)
    # bug fix: the original used z = stats.zscore(input_list), which returns
    # an array and makes int(n) fail; use the 95% critical value as in the
    # variant above
    z = 1.96
    n = ((z * std) / error) ** 2
    return int(n)

from math import exp, log
from scipy.stats import gmean, gstd, shapiro, tmean, tstd

# Fragment: `row` comes from iterating over parsed CSV records.
try:
    part1Time = int(row[2])
    part2Time = int(row[5])
    record.append([part1Time, part2Time])
except Exception:
    continue

scores = []
logscores = []
for row in record:
    for i in row:
        scores.append(float(i))
        logscores.append(log(float(i)))

# renamed from gmean/gstd so the scipy functions are not shadowed
geo_mean = gmean(scores)
geo_std = gstd(scores)
logmean = tmean(logscores)
logstd = tstd(logscores)
logshapiro = shapiro(logscores)
lower = exp(logmean - 2 * logstd)
upper = exp(logmean + 2 * logstd)

print("Score:\n"
      "Geometric mean: {:.6g}\n"
      "Geometric standard deviation: {:.6g}\n"
      "Test of lognormality (Shapiro-Wilk): {:.6g} (p = {:.6g})\n"
      "Prediction interval (95%): {:.6g} - {:.6g} (lognormal dist.)"
      .format(round(geo_mean), geo_std, logshapiro[0], logshapiro[1],
              round(lower), round(upper)))

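# Consistency check on made-up data: the geometric mean/std are exactly the
# exponentiated mean/std of the logs, so the interval above equals
# gmean / gstd**2 .. gmean * gstd**2.
from math import exp, isclose, log
from scipy.stats import gmean, gstd, tmean, tstd

data = [120.0, 95.0, 210.0, 150.0, 80.0]
logs = [log(v) for v in data]
assert isclose(gmean(data), exp(tmean(logs)), rel_tol=1e-9)
assert isclose(gstd(data), exp(tstd(logs)), rel_tol=1e-9)
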
import numpy as np
import pandas as pd
import scipy.stats as sci


def xeno_inhibition_test(qpcr_data, qpcr_normd, x=1):
    '''
    Calculates the difference in Ct compared to the NTC for the xeno
    inhibition test and outputs a list of inhibited samples.

    Params
        x (optional): the dCt threshold above which a sample is inhibited
        qpcr_data (main df): dataframe with qPCR technical triplicates
            averaged. Requires the columns
                Target (includes xeno)
                plate_id
                Well
                Quantity_mean
                Sample
                Task

    Returns
        qpcr_normd with an is_inhibited column
        xeno_fin_all: the difference in Ct values between the negative
            control (spiked with xeno) and the sample spiked with xeno,
            with a column for inhibited (Yes or No)
        ntc_std_control: all of the negative control values for xeno
    '''
    # find targets other than xeno for each well + plate combination
    p_w_targets = qpcr_data[qpcr_data.Target != 'Xeno'].copy()
    p_w_targets['p_id'] = p_w_targets.plate_id.astype('str').str.cat(
        p_w_targets.Well.astype('str'), sep='_')
    p_w_targets = p_w_targets.groupby('p_id')['Target'].apply(
        lambda targs: ','.join(targs)).reset_index()
    p_w_targets.columns = ['p_id', 'additional_target']

    # subset xeno samples, merge with previous, use to calculate mean and std
    target = qpcr_data[qpcr_data.Target == 'Xeno'].copy()  # includes NTC & stds
    target['p_id'] = qpcr_data.plate_id.astype('str').str.cat(qpcr_data.Well,
                                                              sep='_')
    target = target.merge(p_w_targets, how='left', on='p_id')
    if target.additional_target.astype('str').str.contains(',').any():
        print(target.additional_target.unique())
        raise ValueError(
            'Error: update function, more than 2 multiplexed targets or one '
            'of the two multiplexed targets is not xeno')
    # lambda argument renamed to s so it does not shadow the parameter x
    target_s = target.groupby([
        'Sample', 'sample_full', 'additional_target', 'plate_id', 'Task'
    ]).agg(
        Ct_vet_mean=('Cq', lambda s: np.nan if all(np.isnan(s))
                     else sci.gmean(s.dropna(), axis=0)),
        Quantity_std_crv=('Quantity', 'max'),  # just for standards
        Ct_vet_std=('Cq', lambda s: np.nan
                    if ((len(s.dropna()) < 2) | all(np.isnan(s)))
                    else sci.gstd(s.dropna(), axis=0)),
        Ct_vet_count=('Cq', 'count')).reset_index()
    target = target_s[target_s.Task != 'Standard'].copy()  # remove standards

    # subset and recombine to get the NTC as a column
    ntc_col_c = target[target.Task == 'Negative Control'].copy()
    ntc_col = ntc_col_c[['plate_id', 'additional_target', 'Ct_vet_mean']].copy()
    ntc_col.columns = ['plate_id', 'additional_target', 'Ct_control_mean']
    ntc_col_c = ntc_col_c[['plate_id', 'Task', 'Quantity_std_crv',
                           'additional_target', 'Ct_vet_mean']].copy()
    ntc_col_c.columns = ['plate_id', 'Task', 'Quantity_std_crv',
                         'additional_target', 'Ct_control_mean']
    std_col = target_s[target_s.Task == 'Standard'].copy()
    std_col = std_col[['plate_id', 'Task', 'Quantity_std_crv',
                       'additional_target', 'Ct_vet_mean']].copy()
    std_col.columns = ['plate_id', 'Task', 'Quantity_std_crv',
                       'additional_target', 'Ct_control_mean']

    xeno_fin_all = target[target.Task == 'Unknown'].copy()
    xeno_fin_all = xeno_fin_all.merge(ntc_col, how='left')
    xeno_fin_all['dCt'] = (xeno_fin_all['Ct_vet_mean'] -
                           xeno_fin_all['Ct_control_mean'])
    xeno_fin_all['inhibited'] = 'No'
    xeno_fin_all.loc[xeno_fin_all.dCt > x, 'inhibited'] = 'Yes'
    # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent here
    ntc_std_control = pd.concat([ntc_col_c, std_col])
    # bug fix: the original hard-coded the threshold 1 here, ignoring x
    inhibited = xeno_fin_all[xeno_fin_all.dCt > x].Sample.unique()
    not_inhibited = xeno_fin_all[xeno_fin_all.dCt <= x].Sample.unique()
    qpcr_normd['is_inhibited'] = 'unknown'
    qpcr_normd.loc[qpcr_normd.Sample.isin(inhibited), 'is_inhibited'] = True
    qpcr_normd.loc[qpcr_normd.Sample.isin(not_inhibited), 'is_inhibited'] = False
    return qpcr_normd, xeno_fin_all, ntc_std_control

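# Minimal sketch of the dCt rule on made-up numbers: a sample is flagged as
# inhibited when its xeno Ct exceeds the control's by more than x cycles.
import numpy as np
import pandas as pd

x = 1
df = pd.DataFrame({'Sample': ['s1', 's2'],
                   'Ct_vet_mean': [31.2, 30.1],
                   'Ct_control_mean': [29.8, 29.8]})
df['dCt'] = df['Ct_vet_mean'] - df['Ct_control_mean']
df['inhibited'] = np.where(df['dCt'] > x, 'Yes', 'No')
print(df)  # s1 -> Yes (dCt = 1.4), s2 -> No (dCt = 0.3)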