# NOTE: `core_logger` and the `self.*` attributes come from the enclosing
# module and class; only the third-party imports are added here.
import numpy as np
import pandas as pd
from fbpca import pca       # randomized PCA
from geosketch import gs    # geometric sketching subsampler


def subsample(self, counts: pd.DataFrame) -> pd.DataFrame:
    # counts is genes x cells, so shape[1] is the number of input cells.
    input_cells = counts.shape[1]
    if self.num_cells is None:
        # Default: keep a third of the input cells.
        self.num_cells = int(input_cells / 3)

    core_logger.info('Subsampling {} to {}'.format(input_cells, self.num_cells))

    counts_t = counts.T

    # Optionally log-transform before PCA to dampen highly expressed genes.
    if self.log:
        pca_input = np.log1p(counts_t)
    else:
        pca_input = counts_t

    try:
        # Reduce dimensionality, then geometric sketching picks a subsample
        # that preserves the shape of the data manifold.
        u, s, vt = pca(pca_input.values, k=self.num_pc)
        x_dimred = u[:, :self.num_pc] * s[:self.num_pc]
        sketch_index = gs(x_dimred, self.num_cells, replace=False)
        x_matrix = counts_t.iloc[sketch_index]
    except Exception as e:
        # On any failure, fall back to the unsubsampled input.
        core_logger.warning('Subsampling failed: ignored.')
        if self.verbose:
            core_logger.warning(str(e))
        return counts

    core_logger.info('Done subsampling {} to {}'.format(input_cells, self.num_cells))

    return x_matrix.T
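# Usage sketch (hypothetical, not part of the source): `subsample` is a method,
# so it needs an object carrying `num_cells`, `num_pc`, `log` and `verbose`.
# The attribute names come from the method body; the config holder and the
# random counts below are invented for illustration.
from types import SimpleNamespace

config = SimpleNamespace(num_cells=None, num_pc=100, log=True, verbose=False)
counts = pd.DataFrame(np.random.poisson(1.0, size=(500, 300)))  # genes x cells
subsampled = subsample(config, counts)  # returns genes x ~100 cells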
def call(meta: pd.DataFrame, counts: pd.DataFrame, threads: int = 4, debug_seed: int = -1,
         result_precision: int = 3, log2_transform: bool = True) -> pd.DataFrame:
    core_logger.info(
        '[Cluster Statistical Analysis Simple] '
        'Debug-seed:{} Threads:{} Precision:{} Log2-Transformed: {}'.format(debug_seed, threads,
                                                                            result_precision, log2_transform))

    if debug_seed >= 0:
        # pd.np was removed in pandas 1.0; seed numpy directly.
        np.random.seed(debug_seed)
        core_logger.warning('Debug random seed enabled. Set to {}'.format(debug_seed))

    core_logger.info('Running Winsorization')
    winsorized_counts = cpdb_statistical_analysis_helper.log2tf_winsorizer(meta, counts, log2_transform, threads)

    return build_results(winsorized_counts, result_precision)
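# Minimal sketch of what winsorizing count data means (the actual
# `log2tf_winsorizer` helper is not shown here, so this illustrates the
# statistical idea, not its implementation; the percentile bounds are
# assumptions): extreme values are clipped to chosen percentiles,
# optionally after a log2 transform.
def winsorize_sketch(values: np.ndarray, lower_pct: float = 1.0, upper_pct: float = 99.0,
                     log2_transform: bool = True) -> np.ndarray:
    if log2_transform:
        values = np.log2(values + 1)  # +1 avoids log2(0)
    lo, hi = np.percentile(values, [lower_pct, upper_pct])
    return np.clip(values, lo, hi)    # cap outliers at the percentile bounds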
def call(meta: pd.DataFrame, counts: pd.DataFrame, interactions: pd.DataFrame, genes: pd.DataFrame,
         complexes: pd.DataFrame, complex_compositions: pd.DataFrame, iterations: int = 1000,
         threshold: float = 0.1, threads: int = 4, debug_seed: int = -1, round_decimals: int = 1) -> (
        pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    core_logger.info(
        '[Cluster Statistical Analysis Complex] Threshold:{} Iterations:{} Debug-seed:{} Threads:{}'.format(
            threshold, iterations, debug_seed, threads))

    # debug_seed defaults to -1 (disabled); the previous boolean default of
    # False compares as 0 >= 0 and would silently seed the generator.
    if debug_seed >= 0:
        np.random.seed(debug_seed)
        core_logger.warning('Debug random seed enabled. Set to {}'.format(debug_seed))

    cells_names = sorted(counts.columns)

    interactions_filtered, counts_filtered, complex_in_counts = prefilters(interactions, counts, genes, complexes,
                                                                           complex_compositions)

    if interactions_filtered.empty:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    complex_significative_protein = get_complex_significative(complex_in_counts, counts_filtered,
                                                              complex_compositions, cells_names)

    clusters = cpdb_statistical_analysis_helper.build_clusters(meta, counts_filtered)

    core_logger.info('Running Real Complex Analysis')
    cluster_interactions = cpdb_statistical_analysis_helper.get_cluster_combinations(clusters['names'])

    interactions_processed = get_interactions_processed(interactions_filtered, complex_significative_protein)

    base_result = cpdb_statistical_analysis_helper.build_result_matrix(interactions_processed, cluster_interactions)

    real_mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(interactions_processed, clusters,
                                                                        cluster_interactions, base_result)

    real_percents_analysis = cpdb_statistical_analysis_helper.percent_analysis(clusters, threshold,
                                                                               interactions_processed,
                                                                               cluster_interactions, base_result)

    statistical_mean_analysis = cpdb_statistical_analysis_helper.shuffled_analysis(iterations, meta, counts_filtered,
                                                                                   interactions_processed,
                                                                                   cluster_interactions, base_result,
                                                                                   threads)

    result_percent = cpdb_statistical_analysis_helper.build_percent_result(real_mean_analysis,
                                                                           real_percents_analysis,
                                                                           statistical_mean_analysis,
                                                                           interactions_processed,
                                                                           cluster_interactions, base_result)

    pvalues_result, means_result, significant_means, mean_pvalue_result, deconvoluted_result = build_results(
        interactions_filtered,
        real_mean_analysis,
        result_percent,
        clusters['means'],
        complex_compositions,
        counts,
        genes,
        round_decimals
    )

    return pvalues_result, means_result, significant_means, mean_pvalue_result, deconvoluted_result
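# Illustration of assumed behaviour (the helper's source is not shown here):
# `get_cluster_combinations` enumerates every ordered pair of cluster names,
# including self-pairs, so each interaction is scored for every directed
# cluster-to-cluster combination. A sketch of that enumeration:
import itertools

def cluster_combinations_sketch(cluster_names):
    # e.g. ['A', 'B'] -> [('A', 'A'), ('A', 'B'), ('B', 'A'), ('B', 'B')]
    return list(itertools.product(cluster_names, repeat=2))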
def add(self, complexes):
    """
    Uploads complex data from a csv file:

    - Creates new complexes in Multidata table
    - Creates references in Complex table
    - Creates complex compositions to define each complex.
    """
    if complexes.empty:
        return

    existing_complexes = self.database_manager.database.session.query(Multidata.name).all()
    existing_complexes = [c[0] for c in existing_complexes]
    proteins = self.database_manager.database.session.query(Multidata.name, Multidata.id_multidata).join(
        Protein).all()
    proteins = {p[0]: p[1] for p in proteins}

    # Get complex composition info
    complete_indices = []
    incomplete_indices = []
    missing_proteins = []
    complex_map = {}
    for index, row in complexes.iterrows():
        missing = False
        protein_id_list = []
        for protein in ['protein_1', 'protein_2', 'protein_3', 'protein_4']:
            if not pd.isnull(row[protein]):
                protein_id = proteins.get(row[protein])
                if protein_id is None:
                    missing = True
                    missing_proteins.append(row[protein])
                else:
                    protein_id_list.append(protein_id)

        if not missing:
            complex_map[row['name']] = protein_id_list
            complete_indices.append(int(index))
        else:
            incomplete_indices.append(index)

    if len(incomplete_indices) > 0:
        core_logger.warning('MISSING PROTEINS:')
        for protein in missing_proteins:
            core_logger.warning(protein)

        core_logger.warning('COMPLEXES WITH MISSING PROTEINS:')
        core_logger.warning(complexes.iloc[incomplete_indices, :]['name'])

    # Insert complexes
    if not complexes.empty:
        # Remove unwanted columns
        removal_columns = list(
            [x for x in complexes.columns if 'protein_' in x or 'Name_' in x or 'Unnamed' in x])
        # removal_columns += ['comments']
        complexes.drop(removal_columns, axis=1, inplace=True)

        # Remove rows with missing complexes
        complexes = complexes.iloc[complete_indices, :]

        # Convert ints to bool
        bools = ['receptor', 'other', 'secreted_highlight', 'transmembrane', 'secreted', 'peripheral']
        complexes[bools] = complexes[bools].astype(bool)

        # Drop existing complexes
        complexes = complexes[complexes['name'].apply(lambda x: x not in existing_complexes)]

        multidata_df = filters.remove_not_defined_columns(complexes.copy(),
                                                          self.database_manager.get_column_table_names(
                                                              'multidata_table'))
        multidata_df = self._add_complex_optimitzations(multidata_df)
        multidata_df.to_sql(name='multidata_table', if_exists='append',
                            con=self.database_manager.database.engine, index=False, chunksize=50)

    # Now find ids of the new complex rows
    new_complexes = self.database_manager.database.session.query(Multidata.name, Multidata.id_multidata).all()
    new_complexes = {c[0]: c[1] for c in new_complexes}

    # Build set of complexes
    complex_set = []
    complex_table = []
    for complex_name in complex_map:
        complex_id = new_complexes[complex_name]
        for protein_id in complex_map[complex_name]:
            complex_set.append((complex_id, protein_id, len(complex_map[complex_name])))
        complex_table.append({'complex_multidata_id': complex_id, 'name': complex_name})

    # Insert complex composition
    complex_set_df = pd.DataFrame(complex_set,
                                  columns=['complex_multidata_id', 'protein_multidata_id', 'total_protein'])

    complex_table_df = pd.DataFrame(complex_table)
    complex_table_df = pd.merge(complex_table_df, complexes, on='name')

    filters.remove_not_defined_columns(complex_table_df,
                                       self.database_manager.get_column_table_names('complex_table'))

    complex_table_df.to_sql(
        name='complex_table', if_exists='append',
        con=self.database_manager.database.engine, index=False, chunksize=50)

    complex_set_df.to_sql(
        name='complex_composition_table', if_exists='append',
        con=self.database_manager.database.engine, index=False, chunksize=50)
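# Minimal sketch of the input frame this method expects, inferred from the
# columns it reads above (the values are invented for illustration): a 'name'
# column, up to four 'protein_N' member columns (NaN when unused), and
# integer 0/1 flags that get cast to bool.
complexes_input = pd.DataFrame([{
    'name': 'complex:example',
    'protein_1': 'P01111', 'protein_2': 'P02222',   # hypothetical accessions
    'protein_3': None, 'protein_4': None,
    'receptor': 1, 'other': 0, 'secreted_highlight': 0,
    'transmembrane': 1, 'secreted': 0, 'peripheral': 0,
}])
# repository.add(complexes_input)  # `repository` wired to a database_manager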
def call(meta: pd.DataFrame,
         counts: pd.DataFrame,
         counts_data: str,
         interactions: pd.DataFrame,
         pvalue: float,
         separator: str,
         iterations: int = 1000,
         threshold: float = 0.1,
         threads: int = 4,
         debug_seed: int = -1,
         result_precision: int = 3,
         ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    core_logger.info(
        '[Cluster Statistical Analysis Simple] '
        'Threshold:{} Iterations:{} Debug-seed:{} Threads:{} Precision:{}'.format(
            threshold, iterations, debug_seed, threads, result_precision))

    if debug_seed >= 0:
        np.random.seed(debug_seed)
        core_logger.warning('Debug random seed enabled. Set to {}'.format(debug_seed))

    interactions_filtered, counts_filtered = prefilters(counts, interactions, counts_data)

    if interactions_filtered.empty or counts_filtered.empty:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    clusters = cpdb_statistical_analysis_helper.build_clusters(meta, counts_filtered)

    core_logger.info('Running Real Simple Analysis')
    cluster_interactions = cpdb_statistical_analysis_helper.get_cluster_combinations(clusters['names'])

    base_result = cpdb_statistical_analysis_helper.build_result_matrix(interactions_filtered,
                                                                       cluster_interactions, separator)

    real_mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(interactions_filtered, clusters,
                                                                        cluster_interactions, base_result,
                                                                        separator, suffixes=('_1', '_2'),
                                                                        counts_data=counts_data)

    real_percent_analysis = cpdb_statistical_analysis_helper.percent_analysis(clusters, threshold,
                                                                              interactions_filtered,
                                                                              cluster_interactions, base_result,
                                                                              separator, suffixes=('_1', '_2'),
                                                                              counts_data=counts_data)

    statistical_mean_analysis = cpdb_statistical_analysis_helper.shuffled_analysis(iterations, meta, counts_filtered,
                                                                                   interactions_filtered,
                                                                                   cluster_interactions, base_result,
                                                                                   threads, separator,
                                                                                   suffixes=('_1', '_2'),
                                                                                   counts_data=counts_data)

    result_percent = cpdb_statistical_analysis_helper.build_percent_result(real_mean_analysis,
                                                                           real_percent_analysis,
                                                                           statistical_mean_analysis,
                                                                           interactions_filtered,
                                                                           cluster_interactions, base_result,
                                                                           separator)

    pvalues_result, means_result, significant_means, deconvoluted_result = build_results(
        interactions_filtered, real_mean_analysis, result_percent, clusters['means'],
        result_precision, pvalue, counts_data)

    return pvalues_result, means_result, significant_means, deconvoluted_result
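# Sketch of the permutation test behind `shuffled_analysis` and
# `build_percent_result` (an illustration of the statistical idea; the actual
# helpers are vectorised and multi-threaded): cluster labels are shuffled
# `iterations` times, cluster means are recomputed each time, and the p-value
# is the fraction of shuffled means that reach or exceed the real mean.
def permutation_pvalue_sketch(expression: np.ndarray, labels: np.ndarray,
                              cluster: str, iterations: int = 1000) -> float:
    real_mean = expression[labels == cluster].mean()
    shuffled_means = np.empty(iterations)
    for i in range(iterations):
        permuted = np.random.permutation(labels)         # break cluster structure
        shuffled_means[i] = expression[permuted == cluster].mean()
    return float((shuffled_means >= real_mean).mean())   # one-sided empirical p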
def call(meta: pd.DataFrame,
         counts: pd.DataFrame,
         counts_data: str,
         interactions: pd.DataFrame,
         genes: pd.DataFrame,
         complexes: pd.DataFrame,
         complex_compositions: pd.DataFrame,
         pvalue: float,
         separator: str,
         iterations: int = 1000,
         threshold: float = 0.1,
         threads: int = 4,
         debug_seed: int = -1,
         result_precision: int = 3,
         ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    core_logger.info(
        '[Cluster Statistical Analysis] '
        'Threshold:{} Iterations:{} Debug-seed:{} Threads:{} Precision:{}'.format(
            threshold, iterations, debug_seed, threads, result_precision))

    if debug_seed >= 0:
        np.random.seed(debug_seed)
        core_logger.warning('Debug random seed enabled. Set to {}'.format(debug_seed))

    cells_names = sorted(counts.columns)

    interactions.set_index('id_interaction', drop=True, inplace=True)
    interactions_reduced = interactions[['multidata_1_id', 'multidata_2_id']].drop_duplicates()

    complex_compositions.set_index('id_complex_composition', inplace=True, drop=True)

    # Add id_multidata to the counts input
    counts: pd.DataFrame = counts.merge(genes[['id_multidata', 'ensembl', 'gene_name', 'hgnc_symbol']],
                                        left_index=True, right_on=counts_data)
    counts_relations = counts[['id_multidata', 'ensembl', 'gene_name', 'hgnc_symbol']].copy()

    counts.set_index('id_multidata', inplace=True, drop=True)
    counts = counts[cells_names]
    counts = counts.astype('float32')
    # Collapse genes that map to the same multidata id by averaging.
    counts = counts.groupby(counts.index).mean()

    if counts.empty:
        raise AllCountsFilteredException(hint='Are you using human data?')
    # End add id_multidata

    interactions_filtered, counts_filtered, complex_composition_filtered = \
        cpdb_statistical_analysis_helper.prefilters(interactions_reduced, counts, complexes, complex_compositions)

    if interactions_filtered.empty:
        raise NoInteractionsFound()

    clusters = cpdb_statistical_analysis_helper.build_clusters(meta, counts_filtered, complex_composition_filtered)
    core_logger.info('Running Real Analysis')

    cluster_interactions = cpdb_statistical_analysis_helper.get_cluster_combinations(clusters['names'])

    base_result = cpdb_statistical_analysis_helper.build_result_matrix(interactions_filtered,
                                                                       cluster_interactions, separator)

    real_mean_analysis = cpdb_statistical_analysis_helper.mean_analysis(interactions_filtered, clusters,
                                                                        cluster_interactions, base_result, separator)

    real_percents_analysis = cpdb_statistical_analysis_helper.percent_analysis(clusters, threshold,
                                                                               interactions_filtered,
                                                                               cluster_interactions, base_result,
                                                                               separator)

    core_logger.info('Running Statistical Analysis')
    statistical_mean_analysis = cpdb_statistical_analysis_helper.shuffled_analysis(iterations, meta, counts_filtered,
                                                                                   interactions_filtered,
                                                                                   cluster_interactions,
                                                                                   complex_composition_filtered,
                                                                                   base_result, threads, separator)

    result_percent = cpdb_statistical_analysis_helper.build_percent_result(real_mean_analysis,
                                                                           real_percents_analysis,
                                                                           statistical_mean_analysis,
                                                                           interactions_filtered,
                                                                           cluster_interactions, base_result,
                                                                           separator)

    pvalues_result, means_result, significant_means, deconvoluted_result = build_results(
        interactions_filtered,
        interactions,
        counts_relations,
        real_mean_analysis,
        result_percent,
        clusters['means'],
        complex_composition_filtered,
        counts,
        genes,
        result_precision,
        pvalue,
        counts_data
    )

    return pvalues_result, means_result, significant_means, deconvoluted_result
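# Hedged invocation sketch (inputs inferred from how this function uses them;
# the 'cell_type' column name and the gene identifiers are illustrative
# assumptions): `counts` is genes x cells indexed by the identifier named in
# `counts_data`, and `meta` maps each cell barcode to a cluster label.
meta = pd.DataFrame({'cell_type': ['A', 'A', 'B']},
                    index=['cell_1', 'cell_2', 'cell_3'])
counts = pd.DataFrame(np.random.poisson(1.0, size=(4, 3)),
                      index=['ENSG00000000003', 'ENSG00000000005',
                             'ENSG00000000419', 'ENSG00000000457'],
                      columns=meta.index)
# pvalues, means, significant_means, deconvoluted = call(
#     meta, counts, counts_data='ensembl', interactions=interactions,
#     genes=genes, complexes=complexes, complex_compositions=complex_compositions,
#     pvalue=0.05, separator='|', debug_seed=42)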
def blend_dataframes(left_df, left_column_names, right_df, db_column_name, db_table_name, quiet=False):
    result_df = left_df.copy()

    if not quiet and db_column_name in left_df.columns:
        core_logger.debug('WARNING | BLENDING: column "%s" already exists in original df' % db_column_name)

    unique_slug = '_EDITNAME'
    unique_original_column_names = [('%s%s' % (column_name, unique_slug)) for column_name in left_column_names]

    result_df.rename(index=str, columns=dict(zip(left_column_names, unique_original_column_names)), inplace=True)

    not_existent_proteins = []
    for i in range(len(unique_original_column_names)):
        result_df = Repository._blend_column(result_df, right_df, unique_original_column_names[i],
                                             db_column_name, db_table_name, i + 1)

        not_existent_proteins = not_existent_proteins + \
                                result_df[result_df['_merge_%s' % (i + 1)] == 'left_only'][
                                    unique_original_column_names[i]].drop_duplicates().tolist()
    not_existent_proteins = list(set(not_existent_proteins))

    # Keep only rows that matched in every blended column.
    for i in range(1, len(unique_original_column_names) + 1):
        result_df = result_df[(result_df['_merge_%s' % i] == 'both')]

    result_df.drop(['_merge_%s' % merge_column
                    for merge_column in range(1, len(unique_original_column_names) + 1)] +
                   unique_original_column_names,
                   axis=1, inplace=True)

    if len(left_column_names) == 1:
        result_df.rename(index=str, columns={
            '%s_1' % db_column_name: db_column_name,
            '%s_1_id' % db_table_name: '%s_id' % db_table_name
        }, inplace=True)

    if not quiet and not_existent_proteins:
        core_logger.warning('WARNING | BLENDING: THESE %s DID NOT EXIST IN %s' % (db_column_name, db_table_name))
        core_logger.warning(not_existent_proteins)

    return result_df
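# Sketch of the merge-indicator mechanic that the '_merge_N' columns above rely
# on (`Repository._blend_column` itself is not shown, so this only illustrates
# how pandas produces the 'both' / 'left_only' markers; the toy frames are
# invented for illustration):
left = pd.DataFrame({'protein_EDITNAME': ['P01111', 'P09999']})
right = pd.DataFrame({'name': ['P01111'], 'id_multidata': [1]})

blended = left.merge(right, left_on='protein_EDITNAME', right_on='name',
                     how='left', indicator='_merge_1')
matched = blended[blended['_merge_1'] == 'both']         # rows found in the db table
missing = blended[blended['_merge_1'] == 'left_only']    # rows to report as missing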