def query(self, query, **params):
    """Executes the given SQL query against the connected database."""
    chunksize = params.pop("chunksize", 100000)
    to_pandas = params.pop("to_pandas", True)
    with self._cursor() as cursor:
        # Only forward keyword arguments that the driver's execute() accepts.
        params = {
            k: v for k, v in params.items()
            if k in getargs(cursor.execute).args
        }
        cursor.execute(query, **params)
        fields = [i[0] for i in cursor.description]
        res = []
        # Fetch in chunks to keep memory bounded on large result sets.
        while True:
            result = cursor.fetchmany(chunksize)
            if not result:
                break
            res.append(Frame(result))
    frame = rbind(res, bynames=False)
    if frame.shape == (0, 0):
        # Empty result: build a frame with the right column names but no rows.
        frame = Frame({n: [] for n in fields})
    else:
        frame.names = fields
    if to_pandas:
        frame = frame.to_pandas()
    return frame
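# Hypothetical usage of query() above, assuming it is a method of the same
# database client used later in this file (mysql.Database); the URL and table
# name are placeholders, not values from this codebase.
with mysql.Database(URL) as db:
    pdf = db.query("SELECT * FROM test.some_table", to_pandas=True)
    raw = db.query("SELECT * FROM test.some_table", to_pandas=False,
                   chunksize=50000)  # returns a datatable Frame instead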
def aggregate(self, n_bins=500, nx_bins=50, ny_bins=50, max_dimensions=50,
              seed=0):
    dt_exemplars, dt_members = core.aggregate(self._dt, n_bins, nx_bins,
                                              ny_bins, max_dimensions, seed)
    names_exemplars = self.names + ("count", )
    # Must be a tuple, not a bare parenthesized string.
    names_members = ("exemplar_id", )
    return Frame(dt_exemplars, names_exemplars), Frame(dt_members, names_members)
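# Minimal sketch of calling aggregate(); `frame` stands in for any Frame
# instance and is an assumption, not a name from the original code.
exemplars, members = frame.aggregate(n_bins=500, nx_bins=50, ny_bins=50)
# `exemplars` carries the original columns plus a "count" column;
# `members` maps each source row to its exemplar via "exemplar_id".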
def build_drug_synonym_df(drug_file, metadata_dir, output_dir):
    # Get metadata file and drug_df
    drug_metadata = get_metadata(drug_file, metadata_dir)
    drug_df = pd.read_csv(os.path.join(output_dir, 'drug.csv'))
    # Find all columns relevant to drugid
    # Right now only FDA col is dropped, but may be more metadata in the future
    pattern = re.compile('drugid')
    drug_cols = drug_metadata[[
        col for col in drug_metadata.columns if pattern.search(col)
    ]]
    # Get all unique synonyms and join with drug_df
    drug_synonym_df = melt_and_join(drug_cols, 'unique.drugid', drug_df)
    drug_synonym_df = drug_synonym_df.rename(
        columns={'id': 'drug_id', 'value': 'drug_name'})
    # Add blank col for dataset_id (TODO)
    drug_synonym_df['dataset_id'] = np.nan
    # Convert to datatable.Frame for fast write to disk
    df = Frame(drug_synonym_df)
    df = write_table(df, 'drug_synonym', output_dir)
    return df
def build_tissue_synonym_df(tissue_file, metadata_dir, output_dir):
    # Get metadata file and tissue_df (assume that tissue_df is also in output_dir)
    tissue_metadata = get_metadata(tissue_file, metadata_dir)
    tissue_df = pd.read_csv(os.path.join(output_dir, 'tissue.csv'))
    # Find all columns relevant to tissueid
    pattern = re.compile('tissueid')
    tissue_cols = tissue_metadata[[
        col for col in tissue_metadata.columns if pattern.search(col)
    ]]
    # Get all unique synonyms and join with tissue_df
    tissue_synonym_df = melt_and_join(tissue_cols, 'unique.tissueid', tissue_df)
    tissue_synonym_df = tissue_synonym_df.rename(
        columns={'id': 'tissue_id', 'value': 'tissue_name'})
    # Add blank col for dataset_id (TODO)
    tissue_synonym_df['dataset_id'] = np.nan
    # Convert to datatable.Frame for fast write to disk
    df = Frame(tissue_synonym_df)
    df = write_table(df, 'tissue_synonym', output_dir)
    return df
def build_cell_synonym_df(cell_file, metadata_dir, output_dir):
    # Get metadata file and cell_df
    cell_metadata = get_metadata(cell_file, metadata_dir)
    cell_df = pd.read_csv(os.path.join(output_dir, 'cell.csv'))
    # Find all columns relevant to cellid
    pattern = re.compile('cellid')
    cell_columns = cell_metadata[[
        col for col in cell_metadata.columns if pattern.search(col)
    ]]
    # Get all unique synonyms and join with cell_df
    cell_synonym_df = melt_and_join(cell_columns, 'unique.cellid', cell_df)
    cell_synonym_df = cell_synonym_df.rename(
        columns={'id': 'cell_id', 'value': 'cell_name'})
    # Add blank col for dataset_id (TODO)
    cell_synonym_df['dataset_id'] = np.nan
    # Convert to datatable.Frame for fast write to disk
    df = Frame(cell_synonym_df)
    df = write_table(df, 'cell_synonym', output_dir)
    return df
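# Illustrative invocation of the three synonym builders above; the annotation
# file names and directory paths are placeholders, not paths from this project.
drug_syn = build_drug_synonym_df('drug_annotation.csv', 'metadata', 'output')
tissue_syn = build_tissue_synonym_df('tissue_annotation.csv', 'metadata', 'output')
cell_syn = build_cell_synonym_df('cell_annotation.csv', 'metadata', 'output')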
def send_delete(self, df, table: str, mode: str = 'in_set', **params) -> int:
    """Delete entries in a table.

    Use this method instead of send_bulk_query for deleting rows in a table,
    e.g. instead of looping over a long vector and deleting chunkwise.

    - df (DataFrame): a data frame containing the information on which rows
      to delete. See mode for details.
    - table (str): the table where rows are to be deleted.
    - mode (str):
        - 'in_join': delete entries with a match in df. If it is possible to
          do a left join with df, we have a match.
        - 'not_in_join': delete entries with **no** match in df.
        - 'in_set': delete entries which are in the set defined by df. We do
          a 'where in col from df' and concatenate columns with 'and'. This
          can bring a considerable speedup compared to the join strategy if
          you only delete values from one enum field.
        - 'not_in_set': delete entries which are not in the set defined by df.
        - 'in_delete_col': updates the 'delete' column in the target table
          and then sends a delete statement. This can have more predictable
          performance compared to 'in_join'.
    """
    if not isinstance(df, Frame):
        df = Frame(df)
    with self.transaction() as conn:
        return conn.send_delete(df, table, mode, **params)
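# Hedged usage sketch for send_delete(): remove every row whose id appears in
# the frame, using the default 'in_set' strategy. Table and column names are
# assumptions for illustration only.
to_delete = Frame(id=[3, 7, 42])
with mysql.Database(URL) as db:
    n_deleted = db.send_delete(to_delete, table='test.some_table',
                               mode='in_set')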
def get_feature_importance(self) -> Frame:
    gain_importance = self.model.feature_importance(importance_type='gain')
    split_importance = self.model.feature_importance(importance_type='split')
    importance = Frame(variable=self.model.feature_name(),
                       gain=gain_importance,
                       split=split_importance)
    importance = importance.sort(-f.gain)
    return importance
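# Possible usage of get_feature_importance(); `model` is assumed to be a
# fitted instance of the wrapper class defining this method.
importance = model.get_feature_importance()
print(importance.head(10))  # top features by gain, since the frame is sorted descending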
def write_pset_table(pset_df, df_name, pset_name, df_dir):
    """
    Write a PSet table to a CSV file.

    @param pset_df: [`DataFrame`] A PSet DataFrame
    @param df_name: [`string`] The name of the table being written
    @param pset_name: [`string`] The name of the PSet
    @param df_dir: [`string`] The name of the directory to hold all the PSet tables
    @return: [`None`]
    """
    pset_path = os.path.join(df_dir, pset_name)
    # Make sure directory for this PSet exists
    if not os.path.exists(pset_path):
        os.mkdir(pset_path)
    # Convert to datatable Frame for fast write to disk
    pset_df = Frame(pset_df)
    print(f'Writing {df_name} table to {pset_path}...')
    # Use datatable to write the frame to CSV
    pset_df.to_csv(os.path.join(pset_path, f'{pset_name}_{df_name}.csv'))
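# Illustrative call to write_pset_table(); the DataFrame variable, table name,
# PSet name, and output directory are placeholders.
write_pset_table(dose_response_df, 'dose_response', 'GDSC_v2', 'procdata')
# -> writes procdata/GDSC_v2/GDSC_v2_dose_response.csv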
def send_data(self, df, table, mode='insert', **params):
    """Sends df to table in database.

    - df (DataFrame): internally we use a datatable Frame. Any object that
      can be converted to a Frame may be supplied.
    - table (str): name of the table.
    - mode ({'insert', 'truncate', 'delete', 'replace', 'update'}): mode of
      data insertion. Defaults to 'insert'.
        - 'insert': appends data. Duplicates in the primary keys are not
          replaced.
        - 'truncate': drop the table, recreate it, then insert. No rollback
          on error.
        - 'delete': delete all rows in the table, then insert. This operation
          can be rolled back on error, but can be very expensive.
        - 'replace': replaces (delete, then insert) duplicate primary keys.
        - 'update': insert, but with update on duplicate primary keys.
        - 'mode_diffs': sync|insert|update|replace_diffs. Instead of sending
          the complete dataset, first identify the changes and then only send
          the changes. This works most effectively if you only expect few
          changes in your data. 'sync' will not only update new rows, but
          will also delete rows; this has the same effect as a truncate.
    - keys (str|list[str]|None): defaults to None. Columns to identify unique
      values and find differences. None is the default and uses all columns.
    - in_range (str|None): optionally provide the name of a numeric column,
      e.g. an id. We derive min and max and reduce the amount of data we have
      to pull down to construct diffs.
    - chunksize (int): defaults to 10 million. We pull data in chunks and
      remove duplicates from the dataset.
    """
    if not isinstance(df, Frame):
        df = Frame(df)
    with self.transaction() as conn:
        return conn.send_data(df, table, mode, **params)
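# Hypothetical usage of send_data(): upsert two rows into the table created
# later in this file, replacing duplicate primary keys. The values are made up.
rows = Frame(id=[1, 2],
             char1=['aaaaaaaa', 'bbbbbbbb'],
             char2=['cccccccc', 'dddddddd'],
             num1=[0, 1],
             num2=[1, 0])
with mysql.Database(URL) as db:
    db.send_data(rows, 'test.some_table', mode='replace')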
dataset = None

with Study(TRAIN_PARAMS) as study:
    if args.model == 'xgboost':
        optimizer = XGBoostOptimizer(X, y, weights, X_val, y_val, weights_val,
                                     0.05, 0.2)
    else:
        optimizer = LightGBMOptimizer(X, y, weights, X_val, y_val, weights_val,
                                      0.05, 0.2)
    study_importance = study.optimize(optimizer)
    study.log_csv(study_importance,
                  f'{study.experiment_files_prefix}_study_importance.csv')
    best_models = optimizer.get_best_models()
    for model in best_models:
        importance = model.get_feature_importance()
        study.log_csv(
            importance,
            f'{study.experiment_files_prefix}_{model}_importance.csv')
        y_pred = model.predict(dapply_kaggle)
        apply_kaggle = Frame(
            numero_de_cliente=dapply_kaggle['numero_de_cliente'],
            estimulo=y_pred > study.best_params['prob_corte'])
        study.log_csv(
            apply_kaggle,
            f'{study.experiment_files_prefix}_{model}_super_kaggle.csv')
def numbers(nrow):
    "Generate 'nrow' random integers."
    return [rnd.randint(0, 1) for _ in range(nrow)]


def chars(nrow):
    "Generate 'nrow' random strings."
    return [''.join(rnd.choices(string.ascii_letters, k=8))
            for _ in range(nrow)]


DF = Frame(
    id=range(NROW),
    char1=chars(NROW),
    char2=chars(NROW),
    num1=numbers(NROW),
    num2=numbers(NROW)
)

with mysql.Database(URL) as db:
    db.send_bulk_query("""
        CREATE TABLE test.some_table (
            id INTEGER NOT NULL,
            char1 VARCHAR(8) NOT NULL,
            char2 VARCHAR(8) NOT NULL,
            num1 INTEGER NOT NULL,
            num2 INTEGER NOT NULL,
            PRIMARY KEY (`id`)
        )
    """)
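# A plausible follow-up to the snippet above (not part of the original): load
# the generated frame into the freshly created table via the send_data()
# method shown earlier; the 'insert' mode is an assumption.
with mysql.Database(URL) as db:
    db.send_data(DF, 'test.some_table', mode='insert')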
with Study(TRAIN_PARAMS) as study:
    if args.model == 'xgboost':
        optimizer = XGBoostOptimizer(X, y, weights, X_val, y_val, weights_val,
                                     0.05, 0.15)
    else:
        optimizer = LightGBMOptimizer(X, y, weights, X_val, y_val, weights_val,
                                      0.05, 0.15)
    study_importance = study.optimize(optimizer)
    study.log_csv(study_importance,
                  f'{study.experiment_files_prefix}_study_importance.csv')
    best_models = optimizer.get_best_models()

    dataset = dataset_original[:, f[:].remove([f.clase_ternaria])]
    dapply_stacking = dataset
    dapply_kaggle = dataset[f.foto_mes == TRAIN_PARAMS['foto_mes_kaggle'], :]

    for model in best_models:
        importance = model.get_feature_importance()
        study.log_csv(importance,
                      f'{study.experiment_files_prefix}_{model}_importance.csv')

        y_pred = model.predict(dapply_stacking)
        apply_stacking = Frame(
            numero_de_cliente=dapply_stacking['numero_de_cliente'],
            foto_mes=dapply_stacking['foto_mes'],
            prob=y_pred,
            estimulo=y_pred > study.best_params['prob_corte'])
        study.log_csv(apply_stacking,
                      f'{study.experiment_files_prefix}_{model}_stacking_apply.csv')

        y_pred = model.predict(dapply_kaggle)
        apply_kaggle = Frame(
            numero_de_cliente=dapply_kaggle['numero_de_cliente'],
            estimulo=y_pred > study.best_params['prob_corte'])
        study.log_csv(apply_kaggle,
                      f'{study.experiment_files_prefix}_{model}_kaggle.csv')
def score(self,
          actual: np.array,
          predicted: np.array,
          sample_weight: typing.Optional[np.array] = None,
          labels: typing.Optional[List[any]] = None,
          X: typing.Optional[dt.Frame] = None,
          **kwargs) -> float:
    # Get the logger if it exists
    logger = self.get_experiment_logger()

    # Hard-coded, as access to experiment parameters (such as self.tgc) is
    # not yet available.
    tgc = ["Store", "Dept"]
    # tgc = ["state"]
    # tgc = None

    # Enable weighted averaging over per-TS R2 scores, weighted by each time
    # series' share of rows.
    isR2AverageWeighted = False

    # Obtain a scorer for the metric to use
    scorer = self.get_scorer()

    if tgc is None or not all(col in X.names for col in tgc):
        loggerinfo(
            logger,
            f"TS R2 computes single R2 on {X.nrows} rows as either tgc {tgc} "
            f"is not defined or incorrect.")
        return scorer.score(actual, predicted, sample_weight, labels, **kwargs)

    # One row per time series group, with its share of rows and a placeholder R2.
    tgc_values = X[:, {"weight": count() / X.nrows, "r2": 0.0}, by(tgc)]
    loggerinfo(
        logger,
        f"TS R2 computes multiple R2 on {X.nrows} rows, tgc {tgc} with "
        f"weighting is {isR2AverageWeighted}.")

    none_values = [None] * X.nrows
    X = cbind(
        X[:, tgc],
        Frame(actual=actual,
              predicted=predicted,
              sample_weight=sample_weight
              if sample_weight is not None else none_values))

    for i in range(0, tgc_values.nrows):
        # Key the current group's row on the tgc columns and join to slice
        # out the matching observations.
        current_tgc = tgc_values[i, :]
        current_tgc.key = tgc
        ts_frame = X[:, :, join(current_tgc)][~isna(f.r2), :]
        r2_score = scorer.score(
            ts_frame['actual'].to_numpy(),
            ts_frame['predicted'].to_numpy(),
            ts_frame['sample_weight'].to_numpy()
            if sample_weight is not None else None,
            labels, **kwargs)
        tgc_values[i, f.r2] = r2_score
        loggerinfo(
            logger,
            f"TS R2 = {r2_score} on {ts_frame.nrows} rows, "
            f"tgc = {current_tgc[0, tgc].to_tuples()}")

    if isR2AverageWeighted:
        # Weighted average: each group's weight is its share of rows, so the
        # weights already sum to one.
        return np.average(tgc_values["r2"].to_numpy(),
                          weights=tgc_values["weight"].to_numpy())
    return tgc_values[:, mean(f.r2)][0, 0]
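# Self-contained sketch of the grouped-R2 idea above, using plain datatable
# plus sklearn's r2_score in place of the recipe's scorer (an assumption made
# only for illustration; the data values are made up).
import datatable as dt
from datatable import f, by, count, mean
from sklearn.metrics import r2_score

XY = dt.Frame(Store=[1, 1, 2, 2],
              actual=[1.0, 2.0, 3.0, 4.0],
              predicted=[1.1, 1.9, 2.5, 4.5])
# One row per group: its share of rows and a placeholder R2 column.
groups = XY[:, {"weight": count() / XY.nrows, "r2": 0.0}, by("Store")]
for i in range(groups.nrows):
    sub = XY[f.Store == groups[i, "Store"], :]
    groups[i, f.r2] = r2_score(sub["actual"].to_numpy(),
                               sub["predicted"].to_numpy())
unweighted_r2 = groups[:, mean(f.r2)][0, 0]  # simple average over groups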