def read_file(self):
    """Read the input matrix into ``self.input_data``.

    Loads the file at ``self.input_file_path`` as a tab-separated matrix
    (first column is the index). In a test run only a 150x100 slice is
    loaded. Otherwise ``read_pd_df`` is used, which prefers a cached
    pickle version (suffix ``_cashed.pickle``) next to the input matrix
    unless ``self.force`` is set.

    Raises:
        FileNotFoundError: if ``self.input_file_path`` does not exist.
    """
    # Save the start time for the timing overview
    rf_start_time = time.time()
    # Guard clause: fail fast when the input file is missing
    if not os.path.isfile(self.input_file_path):
        raise FileNotFoundError("Cannot find input file: {}".format(
            self.input_file_path))
    if self.test_run:
        # Test dataset: only the first 150 rows and 100 columns
        self.input_data = pd.read_csv(self.input_file_path,
                                      sep="\t",
                                      index_col=0,
                                      nrows=150)
        self.input_data = self.input_data.iloc[:150, :100]
    else:
        # Optionally load only the first n_rows rows instead of the
        # complete dataset (n_rows may come in as a string)
        n_rows = None
        if self.n_rows is not None and self.n_rows != '':
            n_rows = int(self.n_rows)
        # read_pd_df reads either a cached pandas pickle file or a
        # (possibly compressed) tab-separated text file. With
        # force=True the cached pickle is ignored; by default the
        # cached version is loaded when present in the same directory
        # as the input matrix.
        self.input_data = read_pd_df(
            self.input_file_path,
            {
                "sep": "\t",
                "index_col": 0,
                "nrows": n_rows
            },
            force=self.force)
    # Default the number of components to the smallest dimension of
    # the input matrix
    if self.n_components is None:
        self.n_components = np.min(self.input_data.shape)
    # Log some basic info about the loaded dataframe
    logging_print(stats_dict_to_string({
        "Input dataframe n_row": self.input_data.shape[0],
        "Input dataframe n_col": self.input_data.shape[1],
        "first column headers": self.input_data.columns.values[:5],
        "first row index": self.input_data.index.values[:5]
    }))
    timer_print(rf_start_time,
                prefix="Reading input file ready",
                time_overview_log=self.process_time_overview)
def fit_auto_white(self, data):
    """Fit FastICA with sklearn's built-in whitening enabled.

    Stores the estimated sources in ``self.projected_data`` and the
    unmixing components in ``self.components``.
    """
    logging_print("Use fastICA with auto whiten")
    # sklearn performs the whitening step itself here
    ica = FastICA(n_components=self.n_components,
                  algorithm="parallel",
                  fun='logcosh',
                  max_iter=500,
                  tol=1e-10)
    estimated_sources = ica.fit_transform(data)
    # Keep both the sources and the components on the instance
    self.projected_data = estimated_sources
    self.components = ica.components_
    logging_print(
        stats_dict_to_string(
            {"Number of used iterations": ica.n_iter_}))
def fit(self, data):
    """Fit FastICA on manually whitened data.

    Performs the whitening step first (via ``perform_data_whitening``)
    when it has not been done yet, then runs sklearn's FastICA with its
    internal whitening disabled. The resulting independent components
    and sources are stored as dataframes on the instance.
    """
    if self.whiten_components is None:
        # Whitening has not been performed yet
        self.perform_data_whitening(data)
    fit_start_time = time.time()
    # FastICA without the built-in whiten step; whitening was done above
    ica = FastICA(algorithm="parallel",
                  whiten=False,
                  fun='logcosh',
                  max_iter=self.max_iter,
                  tol=1e-10)
    ica.fit(self.whiten_data[:, :self.n_components])
    # Map the unmixing matrix back through the whitening components to
    # obtain independent components in the original feature space
    indep_components = np.dot(
        ica.components_,
        self.whiten_components[:self.n_components, :])
    source_signals = np.dot(indep_components, data.to_numpy().T).T
    ic_index = pd.RangeIndex(start=1,
                             stop=self.n_components + 1,
                             name="IC")
    sources_df = pd.DataFrame(source_signals,
                              index=data.index,
                              columns=ic_index).add_prefix("IC_")
    components_df = pd.DataFrame(
        indep_components,
        index=ic_index,
        columns=data.columns).T.add_prefix("IC_").T
    self.projected_data = sources_df
    self.components = components_df
    logging_print(
        stats_dict_to_string(
            {"Number of used iterations": ica.n_iter_}))
    timer_print(fit_start_time,
                prefix="FastICA component optimalisation is ready")
def write_base_loginfo(self):
    """Log the run configuration at the start of the decomposition."""
    # Collect all run settings first, then emit them in one log call
    run_settings = {
        "## START DECOMPOSITION ##": "",
        "DATE": datetime.now(),
        "Input file": self.input_file_path,
        "Output dir": self.output_dir,
        "Analysis type": self.analysis_type,
        "Over samples": self.over_samples,
        "Test run": self.test_run,
        "FastICA max iter": self.fastICA_max_iter,
        "Number of components": self.n_components,
        "Number of rows": self.n_rows,
        "Perform log2 transformation": self.perform_log2,
        "Perform centering and scaling": self.pre_processing_center_scale,
        "Force": self.force
    }
    logging_print(stats_dict_to_string(run_settings))
def read_file(self):
    """Read the input matrix into ``self.input_data``.

    Loads the file at ``self.input_file_path`` as a tab-separated matrix
    (first column is the index). In a test run only a 150x100 slice is
    loaded; otherwise ``read_pd_df`` is used (which may load a cached
    pickle next to the input matrix unless ``self.force`` is set).

    Raises:
        FileNotFoundError: if ``self.input_file_path`` does not exist.
    """
    rf_start_time = time.time()
    # Guard clause: fail fast when the input file is missing
    if not os.path.isfile(self.input_file_path):
        raise FileNotFoundError("Cannot find input file: {}".format(
            self.input_file_path))
    if self.test_run:
        # Test dataset: only the first 150 rows and 100 columns
        self.input_data = pd.read_csv(self.input_file_path,
                                      sep="\t",
                                      index_col=0,
                                      nrows=150)
        self.input_data = self.input_data.iloc[:150, :100]
    else:
        # Optionally load only the first n_rows rows (n_rows may come
        # in as a string)
        n_rows = None
        if self.n_rows is not None and self.n_rows != '':
            n_rows = int(self.n_rows)
        self.input_data = read_pd_df(self.input_file_path, {
            "sep": "\t",
            "index_col": 0,
            "nrows": n_rows
        }, force=self.force)
    # Default the number of components to the smallest dimension of
    # the input matrix
    if self.n_components is None:
        self.n_components = np.min(self.input_data.shape)
    # Log some basic info about the loaded dataframe
    logging_print(
        stats_dict_to_string({
            "Input dataframe n_row": self.input_data.shape[0],
            "Input dataframe n_col": self.input_data.shape[1],
            "first column headers": self.input_data.columns.values[:5],
            "first row index": self.input_data.index.values[:5]
        }))
    timer_print(rf_start_time,
                prefix="Reading input file ready",
                time_overview_log=self.process_time_overview)
def fit_auto_white(self, data):
    """Fit the FastICA model without the manual whitening step.

    Uses sklearn's FastICA with its built-in whitening. Stores the
    estimated sources in ``self.projected_data`` and the unmixing
    components in ``self.components``.
    """
    logging_print("Use fastICA with auto whiten")
    # The sklearn implementation performs the whitening step itself
    ica = FastICA(n_components=self.n_components,
                  algorithm="parallel",
                  fun='logcosh',
                  max_iter=500,
                  tol=1e-10)
    # Fit the model and keep the results on the instance
    estimated_sources = ica.fit_transform(data)
    self.projected_data = estimated_sources
    self.components = ica.components_
    logging_print(
        stats_dict_to_string(
            {"Number of used iterations": ica.n_iter_}))
def fit(self, data):
    """Fit FastICA on manually whitened data.

    Whitening is performed first (via ``perform_data_whitening``) when
    it has not been done yet; sklearn's own whitening step is disabled.
    The resulting independent components and sources are stored as
    dataframes on the instance.
    """
    if self.whiten_components is None:
        self.perform_data_whitening(data)
    fit_start_time = time.time()
    # FastICA without the built-in whiten step; whitening was done above
    ica = FastICA(algorithm="parallel",
                  whiten=False,
                  fun='logcosh',
                  max_iter=self.max_iter,
                  tol=1e-10)
    ica.fit(self.whiten_data[:, :self.n_components])
    # Map the unmixing matrix back through the whitening components to
    # obtain independent components in the original feature space
    indep_components = np.dot(
        ica.components_,
        self.whiten_components[:self.n_components, :])
    source_signals = np.dot(indep_components, data.to_numpy().T).T
    ic_index = pd.RangeIndex(start=1,
                             stop=self.n_components + 1,
                             name="IC")
    sources_df = pd.DataFrame(source_signals,
                              index=data.index,
                              columns=ic_index).add_prefix("IC_")
    components_df = pd.DataFrame(
        indep_components,
        index=ic_index,
        columns=data.columns).T.add_prefix("IC_").T
    self.projected_data = sources_df
    self.components = components_df
    logging_print(
        stats_dict_to_string(
            {"Number of used iterations": ica.n_iter_}))
    timer_print(fit_start_time,
                prefix="FastICA component optimalisation is ready")