def run_general_clustering_pipeline(self):
    """Run data cleaning for general_clustering_pipeline.

    Args:
        NA.

    Returns:
        validation_flag: Boolean type value indicating if input data is valid or not.
        message: A message indicates the status of current check (the shared
            logger.logging list of accumulated INFO/ERROR lines).
    """
    if self.user_spreadsheet_df is None:
        return False, logger.logging

    # Checks intersection between user spreadsheet data and phenotype data.
    # Phenotype is optional; when present it must share samples with the spreadsheet.
    phenotype_df_cleaned = None
    if self.phenotype_df is not None:
        phenotype_df_cleaned = CommonUtil.check_phenotype_intersection(
            self.phenotype_df, self.user_spreadsheet_df.columns.values)
        if phenotype_df_cleaned is None:
            # Bug fix: corrected typo 'emtpy' -> 'empty' in the error message.
            logger.logging.append('ERROR: Phenotype is empty. Please provide a valid phenotype data.')
            return False, logger.logging

    logger.logging.append('INFO: Start to process user spreadsheet data.')

    # Checks if user spreadsheet contains na value and only real, positive numbers
    user_spreadsheet_df_val_check = SpreadSheet.check_user_spreadsheet_data(
        self.user_spreadsheet_df,
        dropna_colwise=True,
        check_real_number=True,
        check_positive_number=True)
    if user_spreadsheet_df_val_check is None:
        return False, logger.logging

    user_spreadsheet_df_rm_na_header = SpreadSheet.remove_na_header(user_spreadsheet_df_val_check)
    if user_spreadsheet_df_rm_na_header is None:
        return False, logger.logging

    # Removes NA value and duplication on column and row name
    user_spreadsheet_df_cleaned = SpreadSheet.remove_dataframe_indexer_duplication(
        user_spreadsheet_df_rm_na_header)
    if user_spreadsheet_df_cleaned is None:
        return False, logger.logging

    IOUtil.write_to_file(user_spreadsheet_df_cleaned,
                         self.run_parameters['spreadsheet_name_full_path'],
                         self.run_parameters['results_directory'], '_ETL.tsv')
    logger.logging.append(
        'INFO: Cleaned user spreadsheet has {} row(s), {} column(s).'.format(
            user_spreadsheet_df_cleaned.shape[0], user_spreadsheet_df_cleaned.shape[1]))

    if phenotype_df_cleaned is not None:
        IOUtil.write_to_file(phenotype_df_cleaned,
                             self.run_parameters['phenotype_name_full_path'],
                             self.run_parameters['results_directory'], '_ETL.tsv')
        logger.logging.append(
            'INFO: Cleaned phenotype data has {} row(s), {} column(s).'.format(
                phenotype_df_cleaned.shape[0], phenotype_df_cleaned.shape[1]))
    return True, logger.logging
def run_geneset_characterization_pipeline(self):
    """Run data cleaning for geneset_characterization_pipeline.

    Args:
        NA.

    Returns:
        validation_flag: Boolean type value indicating if input data is valid or not.
        message: A message indicates the status of current check (the shared
            logger.logging list of accumulated INFO/ERROR lines).
    """
    if self.user_spreadsheet_df is None:
        return False, logger.logging

    # Checks only non-negative real number appears in user spreadsheet, drop na column wise
    user_spreadsheet_val_chked = SpreadSheet.check_user_spreadsheet_data(
        self.user_spreadsheet_df,
        check_na=True,
        check_real_number=True,
        check_positive_number=True)
    if user_spreadsheet_val_chked is None:
        return False, logger.logging

    # Removes NA value and duplication on column and row name
    user_spreadsheet_df_checked = SpreadSheet.remove_dataframe_indexer_duplication(user_spreadsheet_val_chked)
    # Bug fix: guard against a None result before gene-name mapping, as every
    # other pipeline in this class does after indexer de-duplication.
    if user_spreadsheet_df_checked is None:
        return False, logger.logging

    # Checks the validity of gene name to see if it can be ensemble or not
    user_spreadsheet_df_cleaned, map_filtered_dedup, mapping = SpreadSheet.map_ensemble_gene_name(
        user_spreadsheet_df_checked, self.run_parameters)
    if user_spreadsheet_df_cleaned is None:
        return False, logger.logging

    IOUtil.write_to_file(user_spreadsheet_df_cleaned,
                         self.run_parameters['spreadsheet_name_full_path'],
                         self.run_parameters['results_directory'], '_ETL.tsv')
    # writes dedupped mapping between user_supplied_gene_name and ensemble name to a file
    IOUtil.write_to_file(map_filtered_dedup,
                         self.run_parameters['spreadsheet_name_full_path'],
                         self.run_parameters['results_directory'], '_MAP.tsv',
                         use_index=True, use_header=False)
    # writes user supplied gene name along with its mapping status to a file
    IOUtil.write_to_file(mapping,
                         self.run_parameters['spreadsheet_name_full_path'],
                         self.run_parameters['results_directory'], '_User_To_Ensembl.tsv',
                         use_index=False, use_header=True)
    logger.logging.append(
        'INFO: Cleaned user spreadsheet has {} row(s), {} column(s).'.format(
            user_spreadsheet_df_cleaned.shape[0], user_spreadsheet_df_cleaned.shape[1]))
    return True, logger.logging
def run_phenotype_prediction_pipeline(self):
    """Run data cleaning for phenotype_prediction_pipeline.

    Args:
        NA.

    Returns:
        validation_flag: Boolean type value indicating if input data is valid or not.
        message: A message indicates the status of current check (the shared
            logger.logging list of accumulated INFO/ERROR lines).
    """
    # spreadsheet dimension: sample x phenotype, phenotype dimension: sample x phenotype
    if self.user_spreadsheet_df is None or self.phenotype_df is None:
        return False, logger.logging

    # Checks if user spreadsheet contains only real number and drop na column wise
    user_spreadsheet_dropna = SpreadSheet.check_user_spreadsheet_data(
        self.user_spreadsheet_df,
        dropna_colwise=True,
        check_real_number=True)
    if user_spreadsheet_dropna is None or user_spreadsheet_dropna.empty:
        logger.logging.append('ERROR: After drop NA, user spreadsheet data becomes empty.')
        # Bug fix: was `return None, None`, which broke the class-wide
        # (bool, logger.logging) return contract every caller relies on.
        return False, logger.logging

    # Checks for valid intersection between phenotype data and user spreadsheet data
    dataframe_header = list(user_spreadsheet_dropna.columns.values)
    phenotype_df_pxs_trimmed = CheckUtil.check_intersection_for_phenotype_and_user_spreadsheet(
        dataframe_header, self.phenotype_df)

    # Removes NA value and duplication on column and row name
    user_spreadsheet_df_cleaned = SpreadSheet.remove_dataframe_indexer_duplication(user_spreadsheet_dropna)
    if user_spreadsheet_df_cleaned is None or phenotype_df_pxs_trimmed is None:
        return False, logger.logging

    # Stores cleaned phenotype data (transposed) to a file, dimension: phenotype x sample
    IOUtil.write_to_file(phenotype_df_pxs_trimmed,
                         self.run_parameters['phenotype_name_full_path'],
                         self.run_parameters['results_directory'], '_ETL.tsv')
    IOUtil.write_to_file(user_spreadsheet_df_cleaned,
                         self.run_parameters['spreadsheet_name_full_path'],
                         self.run_parameters['results_directory'], '_ETL.tsv')
    logger.logging.append(
        'INFO: Cleaned user spreadsheet has {} row(s), {} column(s).'.format(
            user_spreadsheet_df_cleaned.shape[0], user_spreadsheet_df_cleaned.shape[1]))
    logger.logging.append(
        'INFO: Cleaned phenotype data has {} row(s), {} column(s).'.format(
            phenotype_df_pxs_trimmed.shape[0], phenotype_df_pxs_trimmed.shape[1]))
    return True, logger.logging
def run_simplified_inpherno_pipeline(self):
    """Run data cleaning for simplified_inpherno_pipeline.

    Args:
        NA.

    Returns:
        validation_flag: Boolean type value indicating if input data is valid or not.
        message: A message indicates the status of current check (the shared
            logger.logging list of accumulated INFO/ERROR lines).
    """
    # Attribute names of the three required input dataframes on self.
    output_files = ['Pvalue_gene_phenotype', 'expression_sample', 'TFexpression']

    # Bug fix: attribute lookup via getattr instead of eval('self.' + name),
    # which was both unsafe and needlessly indirect.
    for attr_name in output_files:
        if getattr(self, attr_name) is None:
            return False, logger.logging

    for attr_name in output_files:
        cur_data = getattr(self, attr_name)
        # Bug fix: string comparison with '==' instead of 'is' — identity of
        # string literals is an implementation detail and raises SyntaxWarning.
        is_tf_expression = (attr_name == 'TFexpression')
        if SpreadSheet.check_user_spreadsheet_data(cur_data,
                                                   check_real_number=True,
                                                   check_na=is_tf_expression) is None:
            return False, logger.logging

        cur_data_cleaned, mapping_dedup, mapping = SpreadSheet.map_ensemble_gene_name(
            cur_data, self.run_parameters)
        if cur_data_cleaned is None:
            return False, logger.logging

        # TFexpression files are written without a header row.
        use_header = not is_tf_expression
        IOUtil.write_to_file(cur_data, self.run_parameters[attr_name + '_full_path'],
                             self.run_parameters['results_directory'], '.tsv',
                             use_header=use_header)
        IOUtil.write_to_file(cur_data_cleaned, self.run_parameters[attr_name + '_full_path'],
                             self.run_parameters['results_directory'], '_ETL.tsv',
                             use_header=use_header)
        # writes dedupped mapping between user_supplied_gene_name and ensemble name to a file
        IOUtil.write_to_file(mapping_dedup, self.run_parameters[attr_name + '_full_path'],
                             self.run_parameters['results_directory'], '_MAP.tsv',
                             use_index=True, use_header=False)
        # writes user supplied gene name along with its mapping status to a file
        IOUtil.write_to_file(mapping, self.run_parameters[attr_name + '_full_path'],
                             self.run_parameters['results_directory'], '_User_To_Ensembl.tsv',
                             use_index=False, use_header=True)
    return True, logger.logging
def run_samples_clustering_pipeline(self):
    """Run data cleaning for samples_clustering_pipeline.

    Args:
        NA.

    Returns:
        validation_flag: Boolean type value indicating if input data is valid or not.
        message: A message indicates the status of current check (the shared
            logger.logging list of accumulated INFO/ERROR lines).
    """
    if self.user_spreadsheet_df is None:
        return False, logger.logging

    logger.logging.append('INFO: Start to process user spreadsheet data.')

    # Checks if only non-negative real number appears in user spreadsheet and drop na column wise
    user_spreadsheet_val_chked = SpreadSheet.check_user_spreadsheet_data(
        self.user_spreadsheet_df,
        dropna_colwise=True,
        check_real_number=True,
        check_positive_number=True)
    if user_spreadsheet_val_chked is None:
        return False, logger.logging

    # Removes NA value and duplication on column and row name
    user_spreadsheet_df_checked = SpreadSheet.remove_dataframe_indexer_duplication(user_spreadsheet_val_chked)

    # Checks the validity of gene name to see if it can be ensemble or not
    user_spreadsheet_df_cleaned, map_filtered_dedup, mapping = SpreadSheet.map_ensemble_gene_name(
        user_spreadsheet_df_checked, self.run_parameters)
    # Bug fix: the None check must precede the network-intersection check below,
    # which dereferences user_spreadsheet_df_cleaned.index and would raise
    # AttributeError when gene mapping fails.
    if user_spreadsheet_df_cleaned is None:
        return False, logger.logging

    # Checks intersection between the mapped genes and the optional gene-gene network.
    if 'gg_network_name_full_path' in self.run_parameters.keys() and \
            not CommonUtil.check_network_data_intersection(user_spreadsheet_df_cleaned.index,
                                                           self.run_parameters):
        return False, logger.logging

    IOUtil.write_to_file(user_spreadsheet_df_cleaned,
                         self.run_parameters['spreadsheet_name_full_path'],
                         self.run_parameters['results_directory'], '_ETL.tsv')
    # writes dedupped mapping between user_supplied_gene_name and ensemble name to a file
    IOUtil.write_to_file(map_filtered_dedup,
                         self.run_parameters['spreadsheet_name_full_path'],
                         self.run_parameters['results_directory'], '_MAP.tsv',
                         use_index=True, use_header=False)
    # writes user supplied gene name along with its mapping status to a file
    IOUtil.write_to_file(mapping,
                         self.run_parameters['spreadsheet_name_full_path'],
                         self.run_parameters['results_directory'], '_User_To_Ensembl.tsv',
                         use_index=False, use_header=True)
    logger.logging.append(
        'INFO: Cleaned user spreadsheet has {} row(s), {} column(s).'.format(
            user_spreadsheet_df_cleaned.shape[0], user_spreadsheet_df_cleaned.shape[1]))

    # The logic here ensures that even if phenotype data doesn't fit the
    # requirement, the rest of the pipelines can still run.
    if self.phenotype_df is not None:
        logger.logging.append('INFO: Start to process phenotype data.')
        phenotype_df_cleaned = CommonUtil.check_phenotype_intersection(
            self.phenotype_df, self.user_spreadsheet_df.columns.values)
        if phenotype_df_cleaned is None:
            # Bug fix: corrected typo 'emtpy' -> 'empty' in the error message.
            logger.logging.append('ERROR: Phenotype is empty. Please provide a valid phenotype data.')
            return False, logger.logging
        IOUtil.write_to_file(phenotype_df_cleaned,
                             self.run_parameters['phenotype_name_full_path'],
                             self.run_parameters['results_directory'], '_ETL.tsv')
        logger.logging.append('INFO: Cleaned phenotype data has {} row(s), {} '
                              'column(s).'.format(phenotype_df_cleaned.shape[0],
                                                  phenotype_df_cleaned.shape[1]))
    return True, logger.logging
def run_signature_analysis_pipeline(self):
    """Run data cleaning for signature_analysis_pipeline.

    Args:
        NA.

    Returns:
        validation_flag: Boolean type value indicating if input data is valid or not.
        message: A message indicates the status of current check (the shared
            logger.logging list of accumulated INFO/ERROR lines).
    """
    if self.signature_df is None or self.user_spreadsheet_df is None:
        return False, logger.logging

    # Removes NA index for both signature data and user spreadsheet data
    signature_df = SpreadSheet.remove_na_index(self.signature_df)
    user_spreadsheet_df = SpreadSheet.remove_na_index(self.user_spreadsheet_df)
    # Bug fix: these None guards previously appeared only near the end of the
    # method, after both dataframes had already been dereferenced; hoist them
    # to where the values are produced.
    if user_spreadsheet_df is None or signature_df is None:
        return False, logger.logging

    # Checks if only real number and non-NA value appear in user spreadsheet
    if SpreadSheet.check_user_spreadsheet_data(user_spreadsheet_df,
                                               check_na=True,
                                               check_real_number=True,
                                               check_positive_number=False) is None:
        return False, logger.logging

    # Checks duplicate columns and rows in user spreadsheet data
    if CheckUtil.check_duplicates(user_spreadsheet_df, check_column=True, check_row=True):
        logger.logging.append('ERROR: Found duplicates on user spreadsheet data. Rejecting...')
        return False, logger.logging

    # Checks intersection of genes between signature data and user spreadsheet data
    intersection = CheckUtil.find_intersection(signature_df.index, user_spreadsheet_df.index)
    if intersection is None:
        logger.logging.append('ERROR: Cannot find intersection between spreadsheet genes and signature genes.')
        return False, logger.logging
    # Bug fix: message previously said 'phenotype'; this pipeline intersects
    # signature and spreadsheet data.
    logger.logging.append(
        'INFO: Found {} intersected gene(s) between signature and spreadsheet data.'.format(len(intersection)))

    # Checks number of unique values in user spreadsheet equals to 2
    if not SpreadSheet.check_unique_values(user_spreadsheet_df, cnt=2):
        # Bug fix: corrected typo 'requirment' -> 'requirement'.
        logger.logging.append(
            'ERROR: user spreadsheet data does not meet the requirement of having at least two unique values.')
        return False, logger.logging

    # Checks intersection among network data, signature data and user spreadsheet data
    if 'gg_network_name_full_path' in self.run_parameters.keys() and \
            not CommonUtil.check_network_data_intersection(intersection, self.run_parameters):
        return False, logger.logging

    IOUtil.write_to_file(user_spreadsheet_df,
                         self.run_parameters['spreadsheet_name_full_path'],
                         self.run_parameters['results_directory'], '_ETL.tsv')
    logger.logging.append(
        'INFO: Cleaned user spreadsheet has {} row(s), {} column(s).'.format(
            user_spreadsheet_df.shape[0], user_spreadsheet_df.shape[1]))

    IOUtil.write_to_file(signature_df,
                         self.run_parameters['signature_name_full_path'],
                         self.run_parameters['results_directory'], '_ETL.tsv')
    # Bug fix: message previously said 'phenotype data' for the signature file.
    logger.logging.append(
        'INFO: Cleaned signature data has {} row(s), {} column(s).'.format(
            signature_df.shape[0], signature_df.shape[1]))
    return True, logger.logging