def _quantile_normalize_matrix(target_vector, original_matrix):
    preprocessCore = importr("preprocessCore")
    as_numeric = rlang("as.numeric")
    data_matrix = rlang("data.matrix")

    # Convert the smashed frames to an R numeric Matrix
    target_vector = as_numeric(target_vector)

    # Do so in chunks if the matrix is too large.
    if original_matrix.shape[1] <= QN_CHUNK_SIZE:
        merged_matrix = data_matrix(original_matrix)
        normalized_matrix = preprocessCore.normalize_quantiles_use_target(
            x=merged_matrix, target=target_vector, copy=True
        )
        # And finally convert back to Pandas
        ar = np.array(normalized_matrix)
        new_merged = pd.DataFrame(ar, columns=original_matrix.columns,
                                  index=original_matrix.index)
    else:
        matrix_chunks = _split_dataframe_columns(original_matrix, QN_CHUNK_SIZE)
        for i, chunk in enumerate(matrix_chunks):
            R_chunk = data_matrix(chunk)
            normalized_chunk = preprocessCore.normalize_quantiles_use_target(
                x=R_chunk, target=target_vector, copy=True
            )
            ar = np.array(normalized_chunk)
            start_column = i * QN_CHUNK_SIZE
            end_column = (i + 1) * QN_CHUNK_SIZE
            original_matrix.iloc[:, start_column:end_column] = ar

        new_merged = original_matrix

    return new_merged
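# The chunked branch above depends on _split_dataframe_columns, which is not
# shown in this section. A minimal sketch of what that helper is assumed to do
# (yield successive column slices of at most chunk_size columns, in order)
# follows; the actual implementation may differ.
def _split_dataframe_columns_sketch(dataframe, chunk_size):
    """Yield consecutive column slices of `dataframe`, each at most `chunk_size` wide."""
    for start in range(0, dataframe.shape[1], chunk_size):
        yield dataframe.iloc[:, start:start + chunk_size]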
def _verify_result(job_context: Dict) -> Dict:
    """ Statistically verify this is a sane result.

    More info: https://github.com/AlexsLemonade/refinebio/issues/599#issuecomment-422132009
    """
    import rpy2
    from rpy2.robjects import pandas2ri
    from rpy2.robjects import r as rlang
    from rpy2.robjects.packages import importr

    qn_target_frame = pd.read_csv(job_context['target_file'], sep='\t',
                                  header=None, index_col=None, error_bad_lines=False)
    smashed_frame = job_context['final_frame']

    # Prepare our RPy2 bridge
    pandas2ri.activate()
    preprocessCore = importr('preprocessCore')
    as_matrix = rlang("as.matrix")
    as_vector = rlang("as.vector")
    data_matrix = rlang('data.matrix')
    all_equal = rlang('all.equal')

    # Recompute the QN target from the smashed frame and compare it
    # against the stored target vector.
    rb_target_vector = as_vector(as_matrix(qn_target_frame[0]))
    exprs_mat = data_matrix(smashed_frame)
    qn_target = preprocessCore.normalize_quantiles_determine_target(exprs_mat)

    is_equal = all_equal(qn_target, rb_target_vector)
    if bool(is_equal):
        job_context['result_verified'] = True
        return job_context
    else:
        job_context['result_verified'] = False
        job_context['success'] = False
        job_context['job'].failure_reason = "Failed QN check!"
        return job_context
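# For intuition, the check in _verify_result can be approximated without the R
# bridge: preprocessCore's normalize.quantiles.determine.target essentially
# averages the column-sorted values rank by rank. A rough NumPy sketch, assuming
# the smashed frame has no missing values and the stored target is in sorted
# (rank) order, not part of the pipeline itself:
def _recompute_qn_target_sketch(smashed_frame, target_vector):
    """Return True if the rank-wise mean of the sorted columns matches the stored target."""
    sorted_columns = np.sort(smashed_frame.to_numpy(), axis=0)
    recomputed_target = sorted_columns.mean(axis=1)
    return np.allclose(recomputed_target, np.asarray(target_vector, dtype=float))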
def _quantile_normalize(job_context: Dict, ks_check=True, ks_stat=0.001) -> Dict:
    """ Apply quantile normalization. """
    # Prepare our QN target file
    organism = job_context['organism']
    qn_target = utils.get_most_recent_qn_target_for_organism(organism)

    if not qn_target:
        logger.error("Could not find QN target for Organism!",
            organism=organism,
            dataset_id=job_context['dataset'].id,
            dataset_data=job_context['dataset'].data,
            processor_job_id=job_context["job"].id,
        )
        job_context['dataset'].success = False
        job_context['job'].failure_reason = "Could not find QN target for Organism: " + str(organism)
        job_context['dataset'].failure_reason = "Could not find QN target for Organism: " + str(organism)
        job_context['dataset'].save()
        job_context['job'].success = False
        job_context['failure_reason'] = "Could not find QN target for Organism: " + str(organism)
        return job_context
    else:
        qn_target_path = qn_target.sync_from_s3()
        qn_target_frame = pd.read_csv(qn_target_path, sep='\t',
                                      header=None, index_col=None, error_bad_lines=False)

        # Prepare our RPy2 bridge
        pandas2ri.activate()
        preprocessCore = importr('preprocessCore')
        as_numeric = rlang("as.numeric")
        data_matrix = rlang('data.matrix')

        # Convert the smashed frames to an R numeric Matrix
        # and the target Dataframe into an R numeric Vector
        target_vector = as_numeric(qn_target_frame[0])
        merged_matrix = data_matrix(job_context['merged_no_qn'])

        # Perform the Actual QN
        reso = preprocessCore.normalize_quantiles_use_target(
            x=merged_matrix,
            target=target_vector,
            copy=True
        )

        # Verify this QN, related:
        # https://github.com/AlexsLemonade/refinebio/issues/599#issuecomment-422132009
        set_seed = rlang("set.seed")
        combn = rlang("combn")
        ncol = rlang("ncol")
        ks_test = rlang("ks.test")
        which = rlang("which")

        set_seed(123)

        n = ncol(reso)[0]
        m = 2
        if n >= m:
            combos = combn(ncol(reso), 2)

            # Convert to NP, Shuffle, Return to R
            ar = np.array(combos)
            np.random.shuffle(np.transpose(ar))
            nr, nc = ar.shape
            combos = ro.r.matrix(ar, nrow=nr, ncol=nc)

            # adapted from
            # https://stackoverflow.com/questions/9661469/r-t-test-over-all-columns
            # apply KS test to randomly selected pairs of columns (samples)
            for i in range(1, min(ncol(combos)[0], 100)):
                value1 = combos.rx(1, i)[0]
                value2 = combos.rx(2, i)[0]

                test_a = reso.rx(True, value1)
                test_b = reso.rx(True, value2)

                # RNA-seq has a lot of zeroes in it, which
                # breaks the ks_test. Therefore we want to
                # filter them out. To do this we drop the
                # lowest half of the values. If there's
                # still zeroes in there, then that's
                # probably too many zeroes so it's okay to
                # fail.
                median_a = np.median(test_a)
                median_b = np.median(test_b)

                # `which` returns indices which are
                # 1-indexed. Python accesses lists with
                # zero-indexes, even if that list is
                # actually an R vector. Therefore subtract
                # 1 to account for the difference.
                test_a = [test_a[i - 1] for i in which(test_a > median_a)]
                test_b = [test_b[i - 1] for i in which(test_b > median_b)]

                # The python list comprehension gives us a
                # python list, but ks_test wants an R
                # vector so let's go back.
                test_a = as_numeric(test_a)
                test_b = as_numeric(test_b)

                ks_res = ks_test(test_a, test_b)
                statistic = ks_res.rx('statistic')[0][0]
                pvalue = ks_res.rx('p.value')[0][0]

                job_context['ks_statistic'] = statistic
                job_context['ks_pvalue'] = pvalue

                # We're unsure of how stringent to be about
                # the p-value just yet, so we're extra lax
                # rather than failing tons of tests. This may need tuning.
                if ks_check:
                    if statistic > ks_stat or pvalue < 0.8:
                        job_context['ks_warning'] = ("Failed Kolmogorov Smirnov test! Stat: " +
                                                     str(statistic) + ", PVal: " + str(pvalue))
        else:
            logger.warning("Not enough columns to perform KS test - either bad smash or single sample smash.",
                dset=job_context['dataset'].id)

        # And finally convert back to Pandas
        ar = np.array(reso)
        new_merged = pd.DataFrame(ar, columns=job_context['merged_no_qn'].columns,
                                  index=job_context['merged_no_qn'].index)
        job_context['merged_qn'] = new_merged
        merged = new_merged

    return job_context
def _test_qn(merged_matrix):
    """ Selects a list of 100 random pairs of columns and performs the KS Test on them.
    Returns a list of tuples with the results of the KS test (statistic, pvalue) """
    # Verify this QN, related:
    # https://github.com/AlexsLemonade/refinebio/issues/599#issuecomment-422132009
    data_matrix = rlang("data.matrix")
    as_numeric = rlang("as.numeric")
    set_seed = rlang("set.seed")
    combn = rlang("combn")
    ncol = rlang("ncol")
    ks_test = rlang("ks.test")
    which = rlang("which")

    merged_R_matrix = data_matrix(merged_matrix)

    set_seed(123)

    n = ncol(merged_R_matrix)[0]
    m = 2
    # Not enough columns to perform KS test - either bad smash or single sample smash.
    if n < m:
        return None

    # This won't work with larger matrices
    # https://github.com/AlexsLemonade/refinebio/issues/1860
    ncolumns = ncol(merged_R_matrix)
    if ncolumns[0] <= 200:
        # Convert to NP, Shuffle, Return to R
        combos = combn(ncolumns, 2)
        ar = np.array(combos)
        np.random.shuffle(np.transpose(ar))
    else:
        indexes = [*range(ncolumns[0])]
        np.random.shuffle(indexes)
        ar = np.array([*zip(indexes[0:100], indexes[100:200])])

    nr, nc = ar.shape
    combos = ro.r.matrix(ar, nrow=nr, ncol=nc)

    result = []
    # adapted from
    # https://stackoverflow.com/questions/9661469/r-t-test-over-all-columns
    # apply KS test to randomly selected pairs of columns (samples)
    for i in range(1, min(ncol(combos)[0], 100)):
        value1 = combos.rx(1, i)[0]
        value2 = combos.rx(2, i)[0]

        test_a = merged_R_matrix.rx(True, value1)
        test_b = merged_R_matrix.rx(True, value2)

        # RNA-seq has a lot of zeroes in it, which
        # breaks the ks_test. Therefore we want to
        # filter them out. To do this we drop the
        # lowest half of the values. If there's
        # still zeroes in there, then that's
        # probably too many zeroes so it's okay to
        # fail.
        median_a = np.median(test_a)
        median_b = np.median(test_b)

        # `which` returns indices which are
        # 1-indexed. Python accesses lists with
        # zero-indexes, even if that list is
        # actually an R vector. Therefore subtract
        # 1 to account for the difference.
        test_a = [test_a[i - 1] for i in which(test_a > median_a)]
        test_b = [test_b[i - 1] for i in which(test_b > median_b)]

        # The python list comprehension gives us a
        # python list, but ks_test wants an R
        # vector so let's go back.
        test_a = as_numeric(test_a)
        test_b = as_numeric(test_b)

        ks_res = ks_test(test_a, test_b)
        statistic = ks_res.rx("statistic")[0][0]
        pvalue = ks_res.rx("p.value")[0][0]

        result.append((statistic, pvalue))

    return result
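# A hedged usage sketch for _test_qn: after a successful quantile normalization,
# every (statistic, pvalue) pair it returns should clear roughly the same
# thresholds used in _quantile_normalize above. `merged_qn_frame` is a
# hypothetical pandas DataFrame of normalized samples; this helper is not part
# of the pipeline itself.
def _all_ks_pairs_pass_sketch(merged_qn_frame, max_stat=0.001, min_pvalue=0.8):
    results = _test_qn(merged_qn_frame)
    if results is None:
        # Fewer than two columns, so no pairs could be tested.
        return False
    return all(stat <= max_stat and pvalue >= min_pvalue for stat, pvalue in results)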