def call(gene_expanded, output_columns):
    """Trim an expanded gene table for output and expose ``name`` as ``uniprot``.

    The frame is edited in place and the same object is returned.

    :param gene_expanded: DataFrame to trim. After filtering it must still
        contain ``id_gene`` and ``protein_id``, because both are dropped
        explicitly afterwards.
    :param output_columns: column names passed to
        ``filters.remove_not_defined_columns`` (presumably the columns to
        keep; the helper's return value is ignored here, so it is expected to
        work in place).
    :return: ``gene_expanded`` with string index labels.
    """
    internal_id_columns = ['id_gene', 'protein_id']
    column_renames = {'name': 'uniprot'}

    filters.remove_not_defined_columns(gene_expanded, output_columns)
    gene_expanded.drop(columns=internal_id_columns, inplace=True)
    # index=str converts every index label to its string form.
    gene_expanded.rename(index=str, columns=column_renames, inplace=True)

    return gene_expanded
def add(self, interactions):
    """Append interactions to ``interaction_table``.

    :param interactions: input handed to ``self.blend_dataframes`` together
        with the ``partner_a`` / ``partner_b`` column names and the multidata
        name/id listing (presumably to resolve partner names to multidata
        ids; the helper is not visible here).
    :return: None
    """
    manager = self.database_manager

    multidata_names = manager.get_repository('multidata').get_all_name_id()
    interaction_df = self.blend_dataframes(
        interactions, ['partner_a', 'partner_b'], multidata_names, 'name', 'multidata')

    # Only columns that exist in the target table may be written.
    table_columns = manager.get_column_table_names('interaction_table')
    filters.remove_not_defined_columns(interaction_df, table_columns)

    interaction_df.to_sql(
        name='interaction_table',
        if_exists='append',
        con=manager.database.engine,
        index=False,
        chunksize=50)
def call(interactions_expanded: pd.DataFrame) -> pd.DataFrame:
    """Prepare an expanded interaction table for output.

    Renames ``name_1`` / ``name_2`` to ``multidata_name_1`` /
    ``multidata_name_2`` in place, filters the frame through
    ``filters.remove_not_defined_columns`` with a fixed column list, and
    returns the result of ``dataframe_format.bring_columns_to_start`` for the
    four name columns.

    :param interactions_expanded: DataFrame to format; modified in place by
        the rename (index labels become strings).
    :return: whatever ``dataframe_format.bring_columns_to_start`` returns.
    """
    column_renames = {'name_1': 'multidata_name_1', 'name_2': 'multidata_name_2'}
    output_columns = [
        'comments_interaction',
        'entry_name_1',
        'entry_name_2',
        'multidata_name_1',
        'multidata_name_2',
        'score_1',
        'score_2',
        'source',
    ]
    leading_columns = ['multidata_name_1', 'entry_name_1', 'multidata_name_2', 'entry_name_2']

    interactions_expanded.rename(index=str, columns=column_renames, inplace=True)
    filters.remove_not_defined_columns(interactions_expanded, output_columns)

    return dataframe_format.bring_columns_to_start(leading_columns, interactions_expanded)
def _only_uniprots_in_df(uniprots_df, inweb_interactions):
    """Keep the interactions whose two partners both appear in ``uniprots_df``.

    ``protein_1`` and then ``protein_2`` are inner-joined against the
    ``uniprot`` column of ``uniprots_df``. After each join the frame is passed
    through ``remove_not_defined_columns`` with the original interaction
    columns (its return value is ignored there, so the helper is expected to
    work in place).

    :param uniprots_df: DataFrame with a ``uniprot`` column.
    :param inweb_interactions: DataFrame with ``protein_1`` and ``protein_2``
        columns.
    :return: result of ``remove_not_defined_columns`` on the de-duplicated
        joined frame.
    """
    interaction_columns = inweb_interactions.columns.values

    inweb_cellphone = inweb_interactions
    for partner_column in ('protein_1', 'protein_2'):
        inweb_cellphone = pd.merge(
            inweb_cellphone, uniprots_df,
            left_on=[partner_column], right_on=['uniprot'], how='inner')
        remove_not_defined_columns(inweb_cellphone, interaction_columns)

    # Prevents duplicated interactions if any uniprot is duplicated in
    # uniprots_df or in the interactions.
    repeated_rows = inweb_cellphone.duplicated()
    inweb_cellphone = inweb_cellphone[~repeated_rows]

    return remove_not_defined_columns(inweb_cellphone, interaction_columns)
def add(self, genes: pd.DataFrame):
    """Append genes to the ``gene`` table.

    Loads ``Protein.id_protein`` together with ``Multidata.name`` from the
    database, combines that listing with ``genes`` through
    ``self._blend_multidata`` on the ``name`` column, renames ``id_protein``
    to ``protein_id`` and writes the columns accepted for the ``gene`` table.

    :param genes: DataFrame of genes to store.
    :return: None
    """
    manager = self.database_manager

    protein_names_query = manager.database.session.query(
        Protein.id_protein, Multidata.name).join(Multidata)
    protein_names = pd.read_sql(protein_names_query.statement, manager.database.session.bind)

    genes = self._blend_multidata(genes, ['name'], protein_names)
    genes.rename(index=str, columns={'id_protein': 'protein_id'}, inplace=True)

    gene_table_columns = manager.get_column_table_names('gene')
    genes = filters.remove_not_defined_columns(genes, gene_table_columns)

    genes.to_sql(
        name='gene',
        if_exists='append',
        con=manager.database.engine,
        index=False,
        chunksize=50)
def add(self, complexes):
    """
    Uploads complex data from csv.

    - Creates new complexes in Multidata table
    - Creates reference in Complex table
    - Creates complex composition to define complexes.

    Complexes that reference a protein name not found in the database are
    reported through ``core_logger`` and are not inserted.

    :param complexes: DataFrame with one row per complex. The code reads the
        columns ``name``, ``protein_1`` .. ``protein_4`` and ``receptor``,
        ``other``, ``secreted_highlight``, ``transmembrane``, ``secreted``,
        ``peripheral`` (cast to bool). Columns whose name contains
        ``protein_``, ``Name_`` or ``Unnamed`` are dropped from the passed
        frame in place.
    :return: None
    """
    if complexes.empty:
        return

    # Every multidata name already stored; a set keeps membership tests O(1).
    existing_complexes = {
        c[0] for c in self.database_manager.database.session.query(Multidata.name).all()}

    proteins = self.database_manager.database.session.query(
        Multidata.name, Multidata.id_multidata).join(Protein).all()
    proteins = {p[0]: p[1] for p in proteins}

    # Get complex composition info
    complete_indices = []
    incomplete_indices = []
    missing_proteins = []
    complex_map = {}
    for index, row in complexes.iterrows():
        missing = False
        protein_id_list = []
        for protein in ['protein_1', 'protein_2', 'protein_3', 'protein_4']:
            if pd.isnull(row[protein]):
                continue
            protein_id = proteins.get(row[protein])
            if protein_id is None:
                missing = True
                missing_proteins.append(row[protein])
            else:
                protein_id_list.append(protein_id)

        # NOTE(review): the collected index labels are used as positions with
        # .iloc below, which is only right for a default 0..n-1 index - confirm.
        if not missing:
            complex_map[row['name']] = protein_id_list
            complete_indices.append(int(index))
        else:
            incomplete_indices.append(index)

    if incomplete_indices:
        core_logger.warning('MISSING PROTEINS:')
        for protein in missing_proteins:
            # Logger.warning() returns None, so the name has to be passed as
            # the message itself rather than "called" on the return value.
            core_logger.warning(protein)

        core_logger.warning('COMEPLEXES WITH MISSING PROTEINS:')
        core_logger.warning(complexes.iloc[incomplete_indices, :]['name'])

    # Insert complexes.
    # Remove unwanted columns
    removal_columns = [
        x for x in complexes.columns if 'protein_' in x or 'Name_' in x or 'Unnamed' in x]
    complexes.drop(removal_columns, axis=1, inplace=True)

    # Remove rows with missing proteins. The copy makes the column assignment
    # below operate on an independent frame instead of a slice.
    complexes = complexes.iloc[complete_indices, :].copy()

    # Convert ints to bool
    bools = ['receptor', 'other', 'secreted_highlight', 'transmembrane', 'secreted', 'peripheral']
    complexes[bools] = complexes[bools].astype(bool)

    # Drop existing complexes. isin() always yields a boolean mask, also when
    # no complete complex is left.
    complexes = complexes[~complexes['name'].isin(existing_complexes)]

    multidata_df = filters.remove_not_defined_columns(
        complexes.copy(), self.database_manager.get_column_table_names('multidata_table'))

    multidata_df = self._add_complex_optimitzations(multidata_df)
    multidata_df.to_sql(name='multidata_table', if_exists='append',
                        con=self.database_manager.database.engine, index=False, chunksize=50)

    # Now find id's of new complex rows
    new_complexes = self.database_manager.database.session.query(
        Multidata.name, Multidata.id_multidata).all()
    new_complexes = {c[0]: c[1] for c in new_complexes}

    # Build set of complexes
    # NOTE(review): complex_map also holds complexes that were filtered out
    # above as already stored, so their composition rows are appended again -
    # confirm this is intended.
    complex_set = []
    complex_table = []
    for complex_name, protein_ids in complex_map.items():
        complex_id = new_complexes[complex_name]
        for protein_id in protein_ids:
            complex_set.append((complex_id, protein_id, len(protein_ids)))
        complex_table.append({'complex_multidata_id': complex_id, 'name': complex_name})

    # Insert complex composition
    complex_set_df = pd.DataFrame(
        complex_set, columns=['complex_multidata_id', 'protein_multidata_id', 'total_protein'])

    # Explicit columns keep the merge key present when there is nothing to add.
    complex_table_df = pd.DataFrame(complex_table, columns=['complex_multidata_id', 'name'])
    complex_table_df = pd.merge(complex_table_df, complexes, on='name')

    filters.remove_not_defined_columns(
        complex_table_df, self.database_manager.get_column_table_names('complex_table'))

    complex_table_df.to_sql(
        name='complex_table', if_exists='append',
        con=self.database_manager.database.engine, index=False, chunksize=50)

    complex_set_df.to_sql(
        name='complex_composition_table', if_exists='append',
        con=self.database_manager.database.engine, index=False, chunksize=50)
def process_protein_data(proteins: pd.DataFrame, protein_columns: list) -> pd.DataFrame:
    """Filter a copy of ``proteins`` through ``protein_columns`` plus ``name``.

    The input frame is copied first, so the caller's frame is not the one
    handed to ``filters.remove_not_defined_columns``.

    :param proteins: source DataFrame.
    :param protein_columns: column names; ``'name'`` is appended before
        filtering.
    :return: result of ``filters.remove_not_defined_columns`` on the copy.
    """
    return filters.remove_not_defined_columns(proteins.copy(), protein_columns + ['name'])
def process_multidata_data(proteins: pd.DataFrame, multidata_columns: list) -> pd.DataFrame:
    """Filter a copy of ``proteins`` through ``multidata_columns``.

    :param proteins: source DataFrame; it is copied before filtering.
    :param multidata_columns: column names handed to
        ``filters.remove_not_defined_columns``.
    :return: result of ``filters.remove_not_defined_columns`` on the copy.
    """
    proteins_copy = proteins.copy()
    multidata_only = filters.remove_not_defined_columns(proteins_copy, multidata_columns)

    return multidata_only
def call(genes: pd.DataFrame, gene_columns: list) -> pd.DataFrame:
    """Rename ``uniprot`` to ``name`` and filter ``genes`` by the gene columns.

    The frame is renamed in place (index labels become strings) and then
    passed to ``filters.remove_not_defined_columns``, whose return value is
    ignored, so the helper is expected to work in place.

    :param genes: DataFrame to process; the same object is returned.
    :param gene_columns: column names; ``'name'`` is appended before
        filtering.
    :return: ``genes``.
    """
    column_renames = {'uniprot': 'name'}
    genes.rename(index=str, columns=column_renames, inplace=True)

    allowed_columns = gene_columns + ['name']
    filters.remove_not_defined_columns(genes, allowed_columns)

    return genes