def load_full(self):
    """Load and concatenate all three splits: train, validate and test.

    Each split is read with the "MachineIdentifier" column plus the
    feature columns reported by ``self.get_loadcols()``; the three
    DataFrames are returned as one concatenated DataFrame.
    """
    split_paths = [
        self.dataset_path / self.valid_dir / "train.csv",
        self.dataset_path / self.valid_dir / "validate.csv",
        self.dataset_path / "test.csv",
    ]
    frames = [
        pd.read_csv(path,
                    dtype=get_csv_dtype(),
                    usecols=["MachineIdentifier"] + self.get_loadcols())
        for path in split_paths
    ]
    return pd.concat(frames)
 def load_part(self):
     '''
     part_data : train, validate, testのいずれかの呼び出し
     '''
     part_df = pd.read_csv(self.input_path,
                           dtype=get_csv_dtype(),
                           usecols=["MachineIdentifier"] +
                           self.get_loadcols())
     return part_df
    def CountEncoding(self, df):
        """Replace the feature column with its value counts from the
        train split.

        Values never seen in train.csv fall back to a count of 1.
        """
        target = df.columns.tolist()[1]

        # Count occurrences of each category in the train split only.
        train_counts = (
            dd.read_csv(self.dataset_path / self.valid_dir / "train.csv",
                        dtype=get_csv_dtype(),
                        usecols=[target])
            .compute()[target]
            .value_counts()
        )

        df[target] = df[target].map(train_counts.to_dict()).fillna(1)
        return df
    def TargetEncoding(self, df):
        """Replace the feature column with the per-category mean of
        "HasDetections" computed on the train split.

        Categories absent from train.csv get 1 / len(train) as a
        fallback value.
        """
        feature = df.columns.tolist()[1]
        train_df = pd.read_csv(
            self.dataset_path / self.valid_dir / "train.csv",
            dtype=get_csv_dtype(),
            usecols=[feature, "HasDetections"])
        category_means = train_df.groupby(feature).mean()["HasDetections"]

        fallback = 1 / len(train_df)
        df[feature] = df[feature].map(category_means.to_dict()).fillna(fallback)
        return df
        # NOTE(review): name looks like a typo for "calcFeatures"; kept as-is
        # since callers (not visible here) may reference this exact name.
        # Local variable names are load-bearing: eval() below receives
        # locals(), so formulas can reference e.g. `part_df` by name.
        def calcFratures(name, formula):
            """Compute one derived feature column for the current part.

            name    : column name to assign to the computed feature.
            formula : 1- or 2-element sequence; formula[0] is a Python
                      expression string that must evaluate to a DataFrame
                      containing "MachineIdentifier"; optional formula[1]
                      is an extra namespace dict for the eval.

            Returns the result renamed to ``name``, reduced to
            ["MachineIdentifier", name], sorted by "MachineIdentifier"
            and re-indexed.
            """
            # Load the part either via dask or plain pandas, depending on
            # the module-level dask_mode() switch (defined elsewhere).
            if dask_mode():
                part_df = dd.read_csv(self.input_path,
                                      dtype=get_csv_dtype(),
                                      usecols=["MachineIdentifier"] +
                                      self.get_loadcols()).compute()
            else:
                part_df = pd.read_csv(self.input_path,
                                      dtype=get_csv_dtype(),
                                      usecols=["MachineIdentifier"] +
                                      self.get_loadcols())
            # SECURITY: eval() runs arbitrary code from `formula` -- only
            # acceptable because formulas are project-internal, never user
            # input. If len(formula) is neither 1 nor 2, new_feature_df is
            # unbound and the next call raises NameError.
            if len(formula) == 1:
                new_feature_df = eval(formula[0], locals())
            elif len(formula) == 2:
                new_feature_df = eval(formula[0], locals(), formula[1])
            # calc_back_checker is defined elsewhere; presumably validates
            # the result covers the same MachineIdentifier set -- TODO confirm.
            calc_back_checker(new_feature_df,
                              set(part_df["MachineIdentifier"].values))
            # Free the (potentially large) source frame eagerly.
            del part_df
            gc.collect()

            # Rename the single non-key result column to the requested name.
            back_col_name = [
                n for n in new_feature_df.columns if n != "MachineIdentifier"
            ][0]
            new_feature_df = new_feature_df.rename(
                columns={back_col_name: name})
            new_feature_df = new_feature_df.loc[:, ["MachineIdentifier", name]]
            new_feature_df.sort_values("MachineIdentifier", inplace=True)
            new_feature_df.reset_index(drop=True, inplace=True)

            # Serialize progress output across worker processes/threads;
            # needCalculationFeatures is a free name defined elsewhere.
            self.lock.acquire()
            try:
                stdout.write(
                    "\r[Group] {:<15} >> multiprocessing : Done {:>3} out of {:>3} latest {:<100}"
                    .format(self.__class__.__name__,
                            str(self.check_job_progress()),
                            str(len(needCalculationFeatures)),
                            "\"" + name + "\""))
            finally:
                self.lock.release()

            return new_feature_df
    def FactorizeEncoding(self, df):
        """Integer-encode the feature column using categories learned
        from the train split.

        The train split is sorted by "MachineIdentifier" first so the
        category order -- and therefore the assigned integer codes -- is
        deterministic. Values not present in train map to -1
        (``Index.get_indexer`` semantics).
        """
        cols = df.columns.tolist()
        train_df = pd.read_csv(self.dataset_path / self.valid_dir /
                               "train.csv",
                               dtype=get_csv_dtype(),
                               usecols=cols)
        train_df.sort_values("MachineIdentifier", inplace=True)

        # Only the uniques index is needed; the original also computed
        # the train split's per-row codes (`labels`) but never used them.
        uniques = pd.factorize(train_df[cols[1]])[1]
        df[cols[1]] = uniques.get_indexer(df[cols[1]])

        return df
    def RankEncoding(self, df):
        """Replace the feature column with the frequency rank of each
        value in the train split (1 = most common; ties take the
        smallest rank via method="min").

        Values unseen in train get the worst possible rank, i.e. the
        number of distinct train values.
        """
        target = df.columns.tolist()[1]

        train_counts = dd.read_csv(
            self.dataset_path / self.valid_dir / "train.csv",
            dtype=get_csv_dtype(),
            usecols=[target]).compute()[target].value_counts()
        ranks = train_counts.rank(method="min", ascending=False)

        worst_rank = len(ranks)
        df[target] = df[target].map(ranks.to_dict()).fillna(worst_rank)
        return df
    def FrequencyEncoding(self, df):
        """Replace the feature column with each value's relative
        frequency in the train split.

        Values unseen in train fall back to 1 / len(train).
        """
        target = df.columns.tolist()[1]

        train_df = pd.read_csv(
            self.dataset_path / self.valid_dir / "train.csv",
            dtype=get_csv_dtype(),
            usecols=[target])
        train_size = len(train_df)
        frequencies = train_df[target].value_counts() / train_size

        df[target] = df[target].map(frequencies.to_dict()).fillna(
            1 / train_size)
        return df
# Example #9
    def LabelEncoding(self, df):
        # NOTE(review): shadowed (dead code) -- an identically named
        # LabelEncoding is defined later in this file and overrides this one.
        """Integer-encode the feature column using categories from the
        current part combined with its counterpart split.

        Only "train" and "test" are paired; the "validate" split is never
        included here, unlike the later LabelEncoding which merges all
        three splits.
        """
        cols = df.columns.tolist()
        # Encode train against test and vice versa.
        another_part = "test" if self.part == "train" else "train"

        # Two dask partitions for the in-memory part.
        part_df = dd.from_pandas(df, 2)
        # NOTE(review): input_path is used here as a directory, while
        # sibling methods (e.g. load_part) read it directly as a CSV file
        # -- confirm which contract is intended.
        another_df = dd.read_csv(self.input_path / "{}.csv".format(another_part),
                                 dtype=get_csv_dtype(),
                                 usecols=cols)
        full_df = dd.concat([part_df, another_df]).compute()

        # Sort for a deterministic factorization order, then encode.
        full_df.sort_values("MachineIdentifier", inplace=True)
        labels, uniques = pd.factorize(full_df[cols[1]])
        full_df[cols[1]] = labels

        # Keep only the rows that belong to the incoming part.
        M_id = df["MachineIdentifier"].values.tolist()
        back_df = full_df[full_df["MachineIdentifier"].isin(M_id)]
        return back_df
    def LabelEncoding(self, df):
        """Integer-encode the feature column consistently across the
        train, validate and test splits.

        All three splits are loaded, concatenated and sorted by
        "MachineIdentifier" so the factorization order is deterministic;
        only the rows belonging to ``df`` are returned, with the feature
        column replaced by its integer codes.
        """
        cols = df.columns.tolist()

        # Collect the splits first and concatenate once: the original
        # called pd.concat inside the loop, which copies the accumulated
        # frame on every iteration (quadratic behaviour).
        frames = []
        for part in ["train", "validate", "test"]:
            if part == "test":
                part_path = self.dataset_path / "test.csv"
            else:
                part_path = self.dataset_path / self.valid_dir / "{}.csv".format(
                    part)
            frames.append(pd.read_csv(part_path,
                                      dtype=get_csv_dtype(),
                                      usecols=cols))
        full_df = pd.concat(frames)

        full_df.sort_values("MachineIdentifier", inplace=True)
        # factorize()[0] holds the integer codes; the uniques index
        # (unused in the original as well) is not needed here.
        full_df[cols[1]] = pd.factorize(full_df[cols[1]])[0]

        # Keep only the rows that belong to the incoming part.
        M_id = df["MachineIdentifier"].values.tolist()
        back_df = full_df[full_df["MachineIdentifier"].isin(M_id)]
        return back_df
 def load_train(self):
     """Load the train split with "MachineIdentifier" plus the feature
     columns reported by ``self.get_loadcols()``.
     """
     wanted_cols = ["MachineIdentifier"] + self.get_loadcols()
     return pd.read_csv(self.dataset_path / self.valid_dir / "train.csv",
                        dtype=get_csv_dtype(),
                        usecols=wanted_cols)