    def read_file(self):
        #
        # Method to read the input file
        #

        # save the start time
        rf_start_time = time.time()

        # Read the input file if present
        if os.path.isfile(self.input_file_path):
            if self.test_run:
                # Test run: read only the first 150 rows and keep
                # the first 100 columns
                self.input_data = pd.read_csv(self.input_file_path,
                                              sep="\t", index_col=0,
                                              nrows=150)
                self.input_data = self.input_data.iloc[:150, :100]
            else:
                # Check whether only part of the rows should be loaded
                # instead of the complete dataset
                n_rows = None
                if self.n_rows is not None and self.n_rows != '':
                    n_rows = int(self.n_rows)

                # Read the input file. This can be a (cached) Pandas pickle
                # file or a tab-separated text file, which may be compressed.
                # If force is set to True, the method will not read cached
                # pickle files created from the original txt matrix, even if
                # these are present. By default the cached version (the file
                # with the suffix _cashed.pickle) is loaded if it is present
                # in the same directory as the input matrix.

                self.input_data = read_pd_df(
                    self.input_file_path,
                    {
                        "sep": "\t",
                        "index_col": 0,
                        "nrows": n_rows
                    },
                    force=self.force)
        else:
            raise FileNotFoundError("Cannot find input file: {}".format(
                self.input_file_path
            ))

        # If the number of components is not set, default it to the
        # smaller dimension of the input matrix
        if self.n_components is None:
            self.n_components = np.min(self.input_data.shape)

        # log some basic info
        logging_print(stats_dict_to_string({
            "Input dataframe n_row": self.input_data.shape[0],
            "Input dataframe n_col": self.input_data.shape[1],
            "first column headers": self.input_data.columns.values[:5],
            "first row index": self.input_data.index.values[:5]
        }))
        timer_print(rf_start_time,
                    prefix="Reading input file ready",
                    time_overview_log=self.process_time_overview)
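
The read_pd_df helper used above is not part of this excerpt. Below is a minimal sketch of what such a cached reader could look like, assuming it pickles the parsed DataFrame next to the source file and reuses that pickle on later runs unless force is set; the helper name and cache suffix come from the comments above, the body is an assumption:

import os
import pandas as pd

def read_pd_df(file_path, read_csv_kwargs, force=False):
    # Hypothetical sketch, not the project's actual implementation.
    # Reuse a cached pickle next to the input matrix unless force=True.
    pickle_path = file_path + "_cashed.pickle"
    if not force and os.path.isfile(pickle_path):
        return pd.read_pickle(pickle_path)
    df = pd.read_csv(file_path, **read_csv_kwargs)
    df.to_pickle(pickle_path)
    return df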
    def fit_auto_white(self, data):
        # Method to fit the FastICA model without the manual whitening
        logging_print("Use fastICA with auto whiten")

        # Use the sklearn implementation to perform FastICA including
        # the whitening step
        fastICA_object = FastICA(n_components=self.n_components,
                                 algorithm="parallel",
                                 fun='logcosh',
                                 max_iter=500,
                                 tol=1e-10)
        # Fit the model
        sources = fastICA_object.fit_transform(data)

        # Save the data
        self.projected_data = sources
        self.components = fastICA_object.components_

        logging_print(
            stats_dict_to_string(
                {"Number of used iterations": fastICA_object.n_iter_}))
    def fit(self, data):
        # Method to fit the FastICA model

        # Check if the whitening step is already done
        if self.whiten_components is None:
            # Perform the whitening step
            self.perform_data_whitening(data)
        fit_start_time = time.time()

        # Create the FastICA object from sklearn without performing
        # the whiten step
        fastICA_object = FastICA(algorithm="parallel",
                                 whiten=False,
                                 fun='logcosh',
                                 max_iter=self.max_iter,
                                 tol=1e-10)
        # Fit the model
        fastICA_object.fit(self.whiten_data[:, :self.n_components])

        # Calculate the independent components and the sources and save
        # the results
        indep_comp = np.dot(fastICA_object.components_,
                            self.whiten_components[:self.n_components, :])
        indep_sources = np.dot(indep_comp, data.to_numpy().T).T

        components_index = pd.RangeIndex(start=1,
                                         stop=self.n_components + 1,
                                         name="IC")

        indep_sources_df = pd.DataFrame(
            indep_sources, index=data.index,
            columns=components_index).add_prefix("IC_")

        indep_comp_df = pd.DataFrame(
            indep_comp, index=components_index,
            columns=data.columns).T.add_prefix("IC_").T

        self.projected_data = indep_sources_df
        self.components = indep_comp_df

        logging_print(
            stats_dict_to_string(
                {"Number of used iterations": fastICA_object.n_iter_}))
        timer_print(fit_start_time,
                    prefix="FastICA component optimalisation is ready")
    def write_base_loginfo(self):
        logging_print(
            stats_dict_to_string({
                "## START DECOMPOSITION ##": "",
                "DATE": datetime.now(),
                "Input file": self.input_file_path,
                "Output dir": self.output_dir,
                "Analysis type": self.analysis_type,
                "Over samples": self.over_samples,
                "Test run": self.test_run,
                "FastICA max iter": self.fastICA_max_iter,
                "Number of components": self.n_components,
                "Number of rows": self.n_rows,
                "Perform log2 transformation": self.perform_log2,
                "Perform centering and scaling":
                    self.pre_processing_center_scale,
                "Force": self.force
            }))
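
logging_print, stats_dict_to_string and timer_print are project helpers that are also not included in this excerpt. A plausible minimal stand-in for stats_dict_to_string, assuming it simply renders one "key: value" pair per line:

def stats_dict_to_string(stats):
    # Hypothetical stand-in for the project's helper: format every
    # key/value pair on its own line for the log output.
    return "\n".join("{}: {}".format(key, value)
                     for key, value in stats.items())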