Example #1
    def query(self, query, **params):
        """
        Executes the given SQL query against the connected database.
        """
        chunksize = params.pop("chunksize", 100000)
        to_pandas = params.pop("to_pandas", True)
        with self._cursor() as cursor:
            params = {
                k: v
                for k, v in params.items() if k in getargs(cursor.execute).args
            }
            cursor.execute(query, **params)
            fields = [i[0] for i in cursor.description]
            res = []
            while True:
                result = cursor.fetchmany(chunksize)
                if not result:
                    break
                res.append(Frame(result))
        frame = rbind(res, bynames=False)
        if frame.shape == (0, 0):
            frame = Frame({n: [] for n in fields})
        else:
            frame.names = fields
        if to_pandas:
            frame = frame.to_pandas()
        return frame
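A minimal usage sketch for the method above, assuming a connected Database-style object that exposes query (the class name, URL, and SQL are illustrative only); the two special keyword arguments are popped before the rest are forwarded to cursor.execute:

db = Database(DB_URL)  # assumed wrapper class; not part of the example above
frame = db.query(
    "SELECT id, name FROM customers",
    chunksize=50000,   # rows pulled per cursor.fetchmany() call
    to_pandas=True,    # convert the datatable Frame to pandas before returning
)
print(frame.shape)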
Example #2
def aggregate(self,
              n_bins=500,
              nx_bins=50,
              ny_bins=50,
              max_dimensions=50,
              seed=0):
    dt_exemplars, dt_members = core.aggregate(self._dt, n_bins, nx_bins,
                                              ny_bins, max_dimensions, seed)
    names_exemplars = self.names + ("count", )
    names_members = ("exemplar_id", )
    return Frame(dt_exemplars, names_exemplars), Frame(dt_members,
                                                       names_members)
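A hedged sketch of how the two returned frames relate; the wrapper class that owns aggregate (called FrameWrapper here) is an assumption, since it is not shown above:

ft = FrameWrapper(big_frame)   # hypothetical wrapper around a datatable Frame
exemplars, members = ft.aggregate(n_bins=500, nx_bins=50, ny_bins=50)
# exemplars: one row per exemplar, the original columns plus a "count" column
# members:   one row per input row, holding the id of the exemplar it maps to
print(exemplars.names)   # (..., "count")
print(members.names)     # ("exemplar_id",)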
Example #3
def build_drug_synonym_df(drug_file, metadata_dir, output_dir):
    # Get metadata file and drug_df
    drug_metadata = get_metadata(drug_file, metadata_dir)
    drug_df = pd.read_csv(os.path.join(output_dir, 'drug.csv'))

    # Find all columns relevant to drugid
    # Right now only the FDA column is dropped, but there may be more metadata columns in the future
    pattern = re.compile('drugid')
    drug_cols = drug_metadata[[
        col for col in drug_metadata.columns if pattern.search(col)
    ]]

    # Get all unique synonyms and join with drugs_df
    drug_synonym_df = melt_and_join(drug_cols, 'unique.drugid', drug_df)
    drug_synonym_df = drug_synonym_df.rename(columns={
        'id': 'drug_id',
        'value': 'drug_name'
    })

    # Add blank col for dataset_id (TODO)
    drug_synonym_df['dataset_id'] = np.nan

    # Convert to datatable.Frame for fast write to disk
    df = Frame(drug_synonym_df)
    df = write_table(df, 'drug_synonym', output_dir)
    return df
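This example and the next two (#4 and #5) rely on a melt_and_join helper whose implementation is not shown. A hypothetical pandas sketch of what such a helper could look like; the 'id' and 'name' columns assumed on the entity table are illustrative, not taken from the source:

import pandas as pd

def melt_and_join(meta_cols: pd.DataFrame, id_col: str, entity_df: pd.DataFrame) -> pd.DataFrame:
    # Melt every synonym column into long (id_col, value) pairs and keep unique ones.
    long_df = (meta_cols.melt(id_vars=[id_col])
                        .dropna(subset=['value'])
                        .drop_duplicates(subset=[id_col, 'value']))
    # Join on the canonical name to pick up the numeric id from the entity table.
    merged = long_df.merge(entity_df, left_on=id_col, right_on='name', how='inner')
    return merged[['id', 'value']]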
Example #4
def build_tissue_synonym_df(tissue_file, metadata_dir, output_dir):
    # Get metadata file and tissue_df (assume that tissue_df is also in output_dir)
    tissue_metadata = get_metadata(tissue_file, metadata_dir)
    tissue_df = pd.read_csv(os.path.join(output_dir, 'tissue.csv'))

    # Find all columns relevant to tissueid
    pattern = re.compile('tissueid')
    tissue_cols = tissue_metadata[[
        col for col in tissue_metadata.columns if pattern.search(col)
    ]]

    # Get all unique synonyms and join with tissue_df
    tissue_synonym_df = melt_and_join(tissue_cols, 'unique.tissueid',
                                      tissue_df)
    tissue_synonym_df = tissue_synonym_df.rename(columns={
        'id': 'tissue_id',
        'value': 'tissue_name'
    })

    # Add blank col for dataset_id (TODO)
    tissue_synonym_df['dataset_id'] = np.nan

    # Convert to datatable.Frame for fast write to disk
    df = Frame(tissue_synonym_df)
    df = write_table(df, 'tissue_synonym', output_dir)
    return df
Example #5
def build_cell_synonym_df(cell_file, metadata_dir, output_dir):
    # Get metadata file and cell_df
    cell_metadata = get_metadata(cell_file, metadata_dir)
    cell_df = pd.read_csv(os.path.join(output_dir, 'cell.csv'))

    # Find all columns relevant to cellid
    pattern = re.compile('cellid')
    cell_columns = cell_metadata[[
        col for col in cell_metadata.columns if pattern.search(col)
    ]]

    # Get all unique synonyms and join with cell_df
    cell_synonym_df = melt_and_join(cell_columns, 'unique.cellid', cell_df)
    cell_synonym_df = cell_synonym_df.rename(columns={
        'id': 'cell_id',
        'value': 'cell_name'
    })

    # Add blank col for dataset_id (TODO)
    cell_synonym_df['dataset_id'] = np.nan

    # Convert to datatable.Frame for fast write to disk
    df = Frame(cell_synonym_df)
    df = write_table(df, 'cell_synonym', output_dir)
    return df
Example #6
    def send_delete(self, df, table: str, mode: str = 'in_set', **params) -> int:
        """
        Delete entries in a table. Use this method instead of send_bulk_query
        for deleting rows in a table; e.g. instead of looping over a long
        vector and deleting chunkwise.

        - df (DataFrame): a data frame containing the information which rows to
          delete. See mode for details.
        - table (str): the table where rows are to be deleted
        - mode (str):
          - 'in_join': delete entries with a match in df. If it is possible to
            do a left join with df, we have a match.
          - 'not_in_join': delete entries with **no** match in df.
          - 'in_set': delete entries which are in the set defined by df. We do
            a 'where in col from df' and concatenate columns with 'and'. This
            can bring a considerable speedup, compared to the join strategy, if
            you only delete values from one enum field.
          - 'not_in_set': delete entries which are not in the set defined by
            df.
          - 'in_delete_col': updates the 'delete' column in the target table
            and then sends a delete statement. This can have more predictable
            performance compared to 'in_join'.
        """
        if not isinstance(df, Frame):
            df = Frame(df)
        with self.transaction() as conn:
            return conn.send_delete(df, table, mode, **params)
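A hedged usage sketch of the 'in_set' mode described above; db is assumed to be an instance of the class that defines send_delete, and the table and column names are illustrative:

to_delete = Frame(status=["expired", "cancelled"])   # the set of values to match
n_deleted = db.send_delete(to_delete, "test.events", mode="in_set")
print(f"removed {n_deleted} rows")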
Example #7
    def get_feature_importance(self) -> Frame:
        gain_importance = self.model.feature_importance(importance_type='gain')
        split_importance = self.model.feature_importance(
            importance_type='split')

        importance = Frame(variable=self.model.feature_name(),
                           gain=gain_importance,
                           split=split_importance)
        importance = importance.sort(-f.gain)

        return importance
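Since the returned Frame is already sorted by descending gain, downstream code can slice the top rows directly; a short sketch, with the trained wrapper instance booster assumed:

importance = booster.get_feature_importance()
top20 = importance[:20, :]                     # datatable row slice
top_features = top20['variable'].to_list()[0]  # to_list() returns one list per column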
Example #8
def write_pset_table(pset_df, df_name, pset_name, df_dir):
    """
    Write a PSet table to a CSV file.

    @param pset_df: [`DataFrame`] A PSet DataFrame
    @param df_name: [`string`] The name of the table being written (used in the file name)
    @param pset_name: [`string`] The name of the PSet
    @param df_dir: [`string`] The name of the directory to hold all the PSet tables
    @return [`None`]
    """
    pset_path = os.path.join(df_dir, pset_name)
    # Make sure directory for this PSet exists
    if not os.path.exists(pset_path):
        os.mkdir(pset_path)

    # Convert to datatable Frame for fast write to disk
    pset_df = Frame(pset_df)

    print(f'Writing {df_name} table to {pset_path}...')
    # Use datatable to convert df to csv
    pset_df.to_csv(os.path.join(pset_path, f'{pset_name}_{df_name}.csv'))
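A hedged usage sketch; the DataFrame and directory names are illustrative only:

# Writes procdata/GDSC_v1/GDSC_v1_dose_response.csv. The df_dir ('procdata')
# is assumed to exist already; the PSet subdirectory is created if needed.
write_pset_table(dose_response_df, 'dose_response', 'GDSC_v1', 'procdata')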
Example #9
    def send_data(self, df, table, mode='insert', **params):
        """Sends df to table in database.

        - df (DataFrame): internally we use datatable Frame. Any object
        that can be converted to a Frame may be supplied.
        - table (str): Name of the table.
        - mode ({'insert', 'truncate', 'delete', 'replace',
        'update'}): Mode of data insertion. Defaults to 'insert'.
            - 'insert': appends data. Duplicates in the
            primary keys are not replaced.
            - 'truncate': drop the table, recreate it, then insert. No
            rollback on error.
            - 'delete': delete all rows in the table, then insert. This
            operation can be rolled back on error, but can be very
            expensive.
            - 'replace': replaces (delete, then insert) duplicate primary
            keys.
            - 'update': insert but with update on duplicate primary keys
            - 'mode_diffs': sync|insert|update|replace_diffs. Instead of
              sending the complete dataset, first identify the changes and then
              only send the changes. This works most effectively if you only
              expect few changes in your data. 'sync' will not only update new
              rows, but will also delete rows; this has the same effect as a
              truncate.
              - keys (str|list[str]|None): defaults to None. Columns to
                identify unique values and find differences. None is the
                default and uses all columns.
              - in_range (str|None): optionally provide a name of a
                numeric column, e.g. an id. We derive min and max and reduce
                the amount of data we have to pull down to construct diffs.
              - chunksize (int): defaults to 10 million. We pull data in chunks
                and remove duplicates from the dataset.
        """
        if not isinstance(df, Frame):
            df = Frame(df)
        with self.transaction() as conn:
            return conn.send_data(df, table, mode, **params)
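A hedged usage sketch of two calls described by the docstring above: a plain insert, then a diff-based sync. The db object, table name, and frames are assumptions; 'sync_diffs' follows the '<mode>_diffs' naming the docstring implies:

db.send_data(new_rows, "test.some_table", mode="insert")
db.send_data(full_snapshot, "test.some_table",
             mode="sync_diffs",       # also deletes rows missing from the snapshot
             keys=["id"],             # columns used to detect differences
             chunksize=1000000)       # pull existing data in chunks of this size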
Example #10
    dataset = None

    with Study(TRAIN_PARAMS) as study:
        if args.model == 'xgboost':
            optimizer = XGBoostOptimizer(X, y, weights, X_val, y_val,
                                         weights_val, 0.05, 0.2)
        else:
            optimizer = LightGBMOptimizer(X, y, weights, X_val, y_val,
                                          weights_val, 0.05, 0.2)

        study_importance = study.optimize(optimizer)
        study.log_csv(study_importance,
                      f'{study.experiment_files_prefix}_study_importance.csv')

        best_models = optimizer.get_best_models()

        for model in best_models:
            importance = model.get_feature_importance()
            study.log_csv(
                importance,
                f'{study.experiment_files_prefix}_{model}_importance.csv')

            y_pred = model.predict(dapply_kaggle)
            apply_kaggle = Frame(
                numero_de_cliente=dapply_kaggle['numero_de_cliente'],
                estimulo=y_pred > study.best_params['prob_corte'])
            study.log_csv(
                apply_kaggle,
                f'{study.experiment_files_prefix}_{model}_super_kaggle.csv')
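A standalone sketch of the keyword-argument Frame construction used above: each keyword becomes a column, and a boolean numpy array is stored as a bool8 column. The names and threshold here are illustrative:

import numpy as np
from datatable import Frame

y_pred = np.array([0.12, 0.71, 0.44])
submission = Frame(numero_de_cliente=[101, 102, 103],
                   estimulo=y_pred > 0.5)   # boolean column from the comparison
print(submission.to_pandas())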
Example #11
def numbers(nrow):
    "Generate 'nrow' random integers."
    return [rnd.randint(0, 1) for _ in range(nrow)]


def chars(nrow):
    "Generate 'nrow' random strings."
    return [''.join(rnd.choices(string.ascii_letters, k=8))
            for _ in range(nrow)]


DF = Frame(
    id=range(NROW),
    char1=chars(NROW),
    char2=chars(NROW),
    num1=numbers(NROW),
    num2=numbers(NROW)
)

with mysql.Database(URL) as db:
    db.send_bulk_query("""
    CREATE TABLE test.some_table
    (
        id INTEGER NOT NULL,
        char1 VARCHAR(8) NOT NULL,
        char2 VARCHAR(8) NOT NULL,
        num1 INTEGER NOT NULL,
        num2 INTEGER NOT NULL,
        PRIMARY KEY (`id`)
    )
    """)

    with Study(TRAIN_PARAMS) as study:
        if args.model == 'xgboost':
            optimizer = XGBoostOptimizer(X, y, weights, X_val, y_val, weights_val, 0.05, 0.15)
        else:
            optimizer = LightGBMOptimizer(X, y, weights, X_val, y_val, weights_val, 0.05, 0.15)

        study_importance = study.optimize(optimizer)
        study.log_csv(study_importance, f'{study.experiment_files_prefix}_study_importance.csv')

        best_models = optimizer.get_best_models()

        dataset = dataset_original[:, f[:].remove([f.clase_ternaria])]
        dapply_stacking = dataset
        dapply_kaggle = dataset[f.foto_mes == TRAIN_PARAMS['foto_mes_kaggle'], :]

        for model in best_models:
            importance = model.get_feature_importance()
            study.log_csv(importance, f'{study.experiment_files_prefix}_{model}_importance.csv')

            y_pred = model.predict(dapply_stacking)
            apply_stacking = Frame(numero_de_cliente=dapply_stacking['numero_de_cliente'],
                                   foto_mes=dapply_stacking['foto_mes'],
                                   prob=y_pred,
                                   estimulo=y_pred > study.best_params['prob_corte'])
            study.log_csv(apply_stacking, f'{study.experiment_files_prefix}_{model}_stacking_apply.csv')

            y_pred = model.predict(dapply_kaggle)
            apply_kaggle = Frame(numero_de_cliente=dapply_kaggle['numero_de_cliente'],
                                 estimulo=y_pred > study.best_params['prob_corte'])
            study.log_csv(apply_kaggle, f'{study.experiment_files_prefix}_{model}_kaggle.csv')
    def score(self,
              actual: np.array,
              predicted: np.array,
              sample_weight: typing.Optional[np.array] = None,
              labels: typing.Optional[typing.List[typing.Any]] = None,
              X: typing.Optional[dt.Frame] = None,
              **kwargs) -> float:

        # Get the logger if it exists
        logger = self.get_experiment_logger()

        # hard-coded because access to experiment parameters (such as self.tgc) is not yet available
        tgc = ["Store", "Dept"]
        # tgc = ["state"]
        # tgc = None

        # enable weighted average over TS R2 scores: weighted based on TS share of rows
        isR2AverageWeighted = False

        # obtain a scorer for metric to use
        scorer = self.get_scorer()

        if tgc is None or not all(col in X.names for col in tgc):
            loggerinfo(
                logger,
                f"TS R2 computes single R2 on {X.nrows} rows as either tgc {tgc} is not defined or incorrect."
            )
            return scorer.score(actual, predicted, sample_weight, labels,
                                **kwargs)
        else:
            tgc_values = X[:, {
                "weight": count() / X.nrows,
                "r2": 0.0
            }, by(tgc)]
            loggerinfo(
                logger,
                f"TS R2 computes multiple R2 on {X.nrows} rows, tgc {tgc} with weighting is {isR2AverageWeighted}."
            )
            none_values = [None] * X.nrows
            X = cbind(
                X[:, tgc],
                Frame(actual=actual,
                      predicted=predicted,
                      sample_weight=sample_weight
                      if sample_weight is not None else none_values))

            for i in range(0, tgc_values.nrows):
                current_tgc = tgc_values[i, :]
                current_tgc.key = tgc
                ts_frame = X[:, :, join(current_tgc)][~isna(f.r2), :]
                r2_score = scorer.score(
                    ts_frame['actual'].to_numpy(),
                    ts_frame['predicted'].to_numpy(),
                    ts_frame['sample_weight'].to_numpy()
                    if sample_weight is not None else None, labels, **kwargs)
                tgc_values[i, f.r2] = r2_score

                loggerinfo(
                    logger,
                    f"TS R2 = {r2_score} on {ts_frame.nrows} rows, tgc = {current_tgc[0, tgc].to_tuples()}"
                )

            if isR2AverageWeighted:
                # return np.average(tgc_values["r2"].to_numpy(), weights=tgc_values["weight"].to_numpy())
                return tgc_values[:, mean(f.r2 * f.weight)][0, 0]
            else:
                return tgc_values[:, mean(f.r2)][0, 0]
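The score method above leans on a handful of datatable idioms: grouped aggregation with by and count, keyed joins, and f-expressions. A small self-contained sketch of the same pattern, with toy data standing in for X:

import datatable as dt
from datatable import f, by, count, join, mean

X = dt.Frame(Store=[1, 1, 2, 2, 2],
             Dept=[1, 2, 1, 1, 2],
             r2=[0.9, 0.8, 0.7, 0.6, 0.5])

# Per-group share of rows, like the 'weight' column of tgc_values above.
groups = X[:, {"weight": count() / X.nrows}, by("Store")]

# Key the small frame and join it back onto X.
groups.key = "Store"
joined = X[:, :, join(groups)]

# Weighted and unweighted aggregates, mirroring the two return branches.
print(joined[:, mean(f.r2 * f.weight)][0, 0])
print(joined[:, mean(f.r2)][0, 0])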