def insert_parasites(raw_data, to_taxon_id):
    """Insert parasites."""
    log(f'Inserting {DATASET_ID} parasites')
    parasite_records = []
    for _, row in raw_data.iterrows():
        parasite_records.append(parasite_record(
            row, to_taxon_id['lice'], 'lice_total',
            ['Ecomorph Notes', 'lice ages']))
        parasite_records.append(parasite_record(
            row, to_taxon_id['feather_mites'], 'feather_mites_total'))
        parasite_records.append(parasite_record(
            row, to_taxon_id['ticks'], 'ticks_total'))
        parasite_records.append(parasite_record(
            row, to_taxon_id['flies'], 'flies_total'))
        parasite_records.append(
            parasite_record(row, to_taxon_id['fleas'], 'fleas_total'))
        parasite_records.append(
            parasite_record(row, to_taxon_id['others'], 'others'))
    parasites = pd.DataFrame(parasite_records)
    parasites['parasite_id'] = db.get_ids(parasites, 'parasites')
    parasites.loc[:, db.PARASITE_COLUMNS].to_sql(
        'parasites', db.connect(), if_exists='append', index=False)
def transform_datetime(df: pd.DataFrame, config: Config):
    date_parts = ["year", "weekday", "month", "day", "hour"]

    if "date_columns" not in config:
        config["date_columns"] = {}
        for c in [c for c in df if c.startswith("datetime_")]:
            config["date_columns"][c] = []
            for part in date_parts:
                part_col = c + "_" + part
                df[part_col] = getattr(df[c].dt, part) \
                    .astype(np.uint16 if part == "year" else np.uint8).values
                if not (df[part_col] != df[part_col].iloc[0]).any():
                    log(part_col + " is constant")
                    df.drop(part_col, axis=1, inplace=True)
                else:
                    config["date_columns"][c].append(part)
            df.drop(c, axis=1, inplace=True)
    else:
        for c, parts in config["date_columns"].items():
            for part in parts:
                part_col = c + "_" + part
                df[part_col] = getattr(df[c].dt, part)
            df.drop(c, axis=1, inplace=True)
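# Illustrative round trip for transform_datetime (a sketch, not part of the
# original module: a plain dict stands in for Config, which is only used as
# a mapping above).
def _demo_transform_datetime():
    df = pd.DataFrame({"datetime_visit":
                       pd.to_datetime(["2018-01-05 10:00", "2018-02-06 11:30"])})
    config = {}
    transform_datetime(df, config)
    # The constant "year" part is logged and dropped on this first pass; the
    # surviving parts ("weekday", "month", "day", "hour") are recorded in
    # config["date_columns"] so the predict-time branch can replay them.
    return df, config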
def delete_dataset(dataset_id):
    """Clear dataset from the database."""
    log(f'Deleting old {dataset_id} records')
    cxn = connect()
    cxn.execute('DELETE FROM datasets WHERE dataset_id = ?', (dataset_id, ))
    sql = """DELETE FROM sites
              WHERE dataset_id NOT IN (SELECT dataset_id FROM datasets)"""
    cxn.execute(sql)
    sql = """DELETE FROM hosts
              WHERE site_id NOT IN (SELECT site_id FROM sites)"""
    cxn.execute(sql)
    sql = """DELETE FROM samples
              WHERE host_id NOT IN (SELECT host_id FROM hosts)"""
    cxn.execute(sql)
    sql = """DELETE FROM parasite_groups
              WHERE sample_id NOT IN (SELECT sample_id FROM samples)"""
    cxn.execute(sql)
    sql = """DELETE FROM parasites
              WHERE parasite_group_id NOT IN
                    (SELECT parasite_group_id FROM parasite_groups)"""
    cxn.execute(sql)
    cxn.commit()
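# Note: the chain of deletes above walks the schema by hand. If the schema
# declared the same chain with foreign keys and ON DELETE CASCADE (an
# assumption about create_db.sql, not verified here), the cleanup would
# reduce to:
#
#     cxn = connect()
#     cxn.execute('PRAGMA foreign_keys = ON')
#     cxn.execute('DELETE FROM datasets WHERE dataset_id = ?', (dataset_id,))
#     cxn.commit()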
def hyperopt_lightgbm(X: pd.DataFrame, y: pd.Series, params: Dict, config: Config):
    X_train, X_val, y_train, y_val = data_split(X, y, test_size=0.5)
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_val, label=y_val)

    space = {
        "learning_rate": hp.uniform("learning_rate", 0.01, 0.05),
        "max_depth": hp.choice("max_depth", [-1, 4, 6, 10, 16]),
        "num_leaves": hp.choice("num_leaves", np.linspace(10, 200, 50, dtype=int)),
        "feature_fraction": hp.quniform("feature_fraction", 0.5, 1.0, 0.1),
        "bagging_fraction": hp.quniform("bagging_fraction", 0.5, 1.0, 0.1),
        "bagging_freq": hp.choice("bagging_freq", np.linspace(0, 50, 10, dtype=int)),
        "reg_alpha": hp.uniform("reg_alpha", 0, 30),
        "reg_lambda": hp.uniform("reg_lambda", 0, 30),
        "min_child_weight": hp.uniform("min_child_weight", 0.5, 50),
    }

    def objective(hyperparams):
        model = lgb.train({**params, **hyperparams}, train_data, 300, valid_data,
                          early_stopping_rounds=100, verbose_eval=100)
        score = model.best_score["valid_0"][params["metric"]]
        if config.is_classification():
            # hyperopt minimizes, so negate higher-is-better metrics like AUC
            score = -score
        return {'loss': score, 'status': STATUS_OK}

    trials = Trials()
    best = hyperopt.fmin(fn=objective, space=space, trials=trials,
                         algo=tpe.suggest, max_evals=50, verbose=1,
                         rstate=np.random.RandomState(1))
    hyperparams = space_eval(space, best)
    log("{:0.4f} {}".format(trials.best_trial['result']['loss'], hyperparams))
    return hyperparams
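# Minimal usage sketch for hyperopt_lightgbm (illustrative only: the
# synthetic frame is an assumption, and `config` must be a real Config with
# the mode set, so the call is left commented):
#
#     X = pd.DataFrame(np.random.rand(1000, 10),
#                      columns=[f"number_{i}" for i in range(10)])
#     y = pd.Series(np.random.randint(0, 2, 1000))
#     params = {"objective": "binary", "metric": "auc",
#               "verbosity": -1, "seed": 1}
#     hyperparams = hyperopt_lightgbm(X, y, params, config)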
def _get_raw(cls, resource, part, max_results=None, **kwargs):
    if max_results is not None and max_results < cls._max_results_per_request:
        max_results_per_request = max_results
    else:
        max_results_per_request = cls._max_results_per_request
    results = []
    request = resource.list(part=part, maxResults=max_results_per_request, **kwargs)
    while request and (max_results is None or len(results) < max_results):
        util.log('Requesting {} ...', request.uri)
        try:
            response = request.execute()
        except googleapiclient.http.HttpError as e:
            # The HttpError class is currently broken and does not decode
            # the received data before parsing it.
            if isinstance(e.content, bytes):
                e.content = e.content.decode()
            raise
        results.extend(map(_Item.wrap_json, response.get('items', [])))
        request = resource.list_next(request, response)
    return results[:max_results]
def rename_id_columns(df: pd.DataFrame, config: Config):
    if "id_columns" not in config:
        config["id_columns"] = {c: 'string_' + c for c in df if c.startswith("id_")}
        log("Id columns: " + ", ".join(config["id_columns"]), config.verbose)
    if len(config["id_columns"]) > 0:
        df.rename(columns=config["id_columns"], inplace=True)
def drop_constant_columns(df: pd.DataFrame, config: Config):
    if "constant_columns" not in config:
        config["constant_columns"] = [
            c for c in df
            if c.startswith("number_") and not (df[c] != df[c].iloc[0]).any()
        ]
        log("Constant columns: " + ", ".join(config["constant_columns"]))
    if len(config["constant_columns"]) > 0:
        df.drop(config["constant_columns"], axis=1, inplace=True)
def backup_database():
    """Backup the SQLite3 database."""
    log('Backing up SQLite3 database')
    now = datetime.now()
    backup = f'{DB_FILE[:-3]}_{now.strftime("%Y-%m-%d")}.db'
    cmd = f'cp {DB_FILE} {backup}'
    subprocess.check_call(cmd, shell=True)
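# A shell-free sketch of the same backup (hypothetical alternative, not in
# the original; assumes DB_FILE and log from this module; shutil.copy2 also
# preserves file metadata):
import shutil

def backup_database_no_shell():
    """Backup the SQLite3 database without spawning a shell."""
    log('Backing up SQLite3 database')
    backup = f'{DB_FILE[:-3]}_{datetime.now().strftime("%Y-%m-%d")}.db'
    shutil.copy2(DB_FILE, backup)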
def validate(preds: pd.DataFrame, target_csv: str, mode: str,
             verbose: int = 0) -> np.float64:
    df = pd.merge(preds, pd.read_csv(target_csv), on="line_id", left_index=True)
    score = roc_auc_score(df.target.values, df.prediction.values) \
        if mode == "classification" else \
        np.sqrt(mean_squared_error(df.target.values, df.prediction.values))
    log("Score: {:0.4f}".format(score), verbose)
    return score
def preprocess_pipeline(df: pd.DataFrame, config: Config):
    drop_columns(df, config)

    date_cols = list(df.filter(like='datetime_'))
    str_cols = list(df.filter(like='string_'))
    num_cols = list(df.filter(like='number_'))
    id_cols = list(df.filter(like='id_'))

    for c in id_cols + num_cols:
        if str(df[c].dtype) == 'object':
            log(f'column {c} is object (expected numerical type); cast to category codes')
            df[c] = df[c].astype('category').cat.as_ordered().cat.codes

    df = add_is_na_cols(df, config)
    df = fillna(df, config)
    df = downcast(df, config)
    non_negative_target_detect(df, config)

    if len(date_cols) != 0:
        df = process_datetime(df, date_cols, config)
    if len(str_cols) != 0:
        df = process_strings(df, str_cols, config)
        df = mean_encode_kf(df, str_cols, 5, config)
    return df
def fetchData(self, force=False):
    self.income.fetchData(force)
    self.balance.fetchData(force)
    self.cashflow.fetchData(force)
    self.keyRatio.fetchData(force)
    self.quote.fetchData(force)
    log('Finished fetching all data')
def ingest(args):
    """Ingest datasets into the SQLite3 database."""
    for ingest, module in INGESTS:
        if ingest in args.datasets:
            log(SEPARATOR)
            module.ingest()
    log(SEPARATOR)
def set_choice(chosen_id):
    log("Updating tracer choice to " + str(chosen_id))
    trace = get_trace()
    if trace is None:  # get_trace already logged the reason
        return
    trace.choice = chosen_id
    db.session.add(trace)
    db.session.commit()
    log("tracer updated")
def query(self, sql):
    try:
        self._cur.execute("SET NAMES utf8")
        result = self.execute(sql)
    except MySQLdb.Error as e:
        self.error_code = e.args[0]
        util.log("Query sql error:%d %s" % (e.args[0], e.args[1]), 3, "mysql")
        result = False
    return result
def train_lightgbm(X: pd.DataFrame, y: pd.Series, config: Config):
    params = {
        "objective": "regression" if config["mode"] == "regression" else "binary",
        "metric": "rmse" if config["mode"] == "regression" else "auc",
        "verbosity": -1,
        "seed": 1,
    }

    X_sample, y_sample = data_sample(X, y)
    hyperparams = hyperopt_lightgbm(X_sample, y_sample, params, config)

    n_split = config["n_split_lgb"]
    kf = KFold(n_splits=n_split, random_state=2018, shuffle=True)
    config["model"] = []
    oofs = np.zeros((X.shape[0],))
    scores = []
    iter_times = []
    # time_reserved = (config["h2o_min_time_allowance"]
    #                  + config["other_time_allowance"])

    for i, (train_ind, test_ind) in enumerate(kf.split(X)):
        time_spent = time.time() - config["start_time"]
        time_left = max(0, config["time_limit"] - time_spent)
        # Reserving time for h2o if needed (disabled along with the
        # time_reserved computation above):
        # if config["train_h2o"] and (time_left > time_reserved):
        #     time_left = max(0, time_left - time_reserved)

        # Assume iterations take roughly the same time; if the next fold is
        # unlikely to fit in the remaining budget, stop early.
        max_iter_time = max(iter_times) if len(iter_times) > 0 else 0
        if max_iter_time * config["iter_time_coeff"] > time_left:
            break

        iter_start = time.time()
        X_train, X_val = X.iloc[train_ind, :], X.iloc[test_ind, :]
        y_train, y_val = y[train_ind], y[test_ind]
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_val, label=y_val)

        mdl = lgb.train({**params, **hyperparams}, train_data, 3000, valid_data,
                        early_stopping_rounds=50, verbose_eval=100)
        config["model"].append(mdl)

        oof = mdl.predict(X_val)
        oofs[test_ind] = oof
        if config["mode"] == "regression":
            score = np.sqrt(mean_squared_error(y_val, oof))
        else:
            score = roc_auc_score(y_val, oof)
        scores.append(score)

        iter_time = time.time() - iter_start
        iter_times.append(iter_time)
        log(f"FOLD: {i}, Score: {round(score, 2)}, time: {iter_time:.2f}")

    log(f"Total score: {np.mean(scores)}, std: {np.std(scores)}")
def _load_users(self):
    users = getpwall()
    if self.limit_to_group:
        users = [u for u in users if u.pw_gid == self.group.gr_gid]
    if len(users) < self.minimum_users_count:
        log("too few users found... check configuration (got %u, need %u)"
            % (len(users), self.minimum_users_count))
        exit(1)
    self.users = users
def get_taxa():
    """Build a dictionary of scientific names and taxon_ids."""
    log(f'Getting {DATASET_ID} taxa')
    sql = """SELECT taxon_id, sci_name
               FROM taxa
              WHERE sci_name IN ({})"""
    sql = sql.format(','.join([f"'{x}'" for x in TARGETS]))
    taxa = pd.read_sql(sql, db.connect())
    return taxa.set_index('sci_name').taxon_id.to_dict()
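# Sketch of a parameterised variant (avoids hand-quoting the names; same
# taxa table assumed, and pd.read_sql passes params through to sqlite3):
#
#     placeholders = ','.join('?' * len(TARGETS))
#     sql = f'SELECT taxon_id, sci_name FROM taxa WHERE sci_name IN ({placeholders})'
#     taxa = pd.read_sql(sql, db.connect(), params=TARGETS)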
def update(self, sql):
    try:
        self._cur.execute("SET NAMES utf8")
        result = self.execute(sql)
        self._conn.commit()
    except MySQLdb.Error as e:
        self.error_code = e.args[0]
        util.log("[MySQL]Update sql error:%d %s" % (e.args[0], e.args[1]), 3, "mysql")
        result = False
    return result
def drop_constant_columns(df: pd.DataFrame, config: Config):
    if "constant_columns" not in config:
        config["constant_columns"] = get_constant_columns(df)
        log("Constant columns: " + ", ".join(config["constant_columns"]),
            config.verbose)
    if len(config["constant_columns"]) > 0:
        df.drop(config["constant_columns"], axis=1, inplace=True)
def insert(self, sql):
    try:
        self._cur.execute("SET NAMES utf8")
        self.execute(sql)
        self._conn.commit()
        return int(self._cur.lastrowid)
    except MySQLdb.Error as e:
        self.error_code = e.args[0]
        util.log("[MySQL]Insert sql error:%d %s" % (e.args[0], e.args[1]), 3, "mysql")
        return False
def validate(preds: pd.DataFrame, target_csv: str, mode: str,
             config: Config) -> np.float64:
    df = pd.merge(preds, pd.read_csv(target_csv), on="line_id", left_index=True)
    score = roc_auc_score(df.target.values, df.prediction.values) \
        if mode == "classification" else \
        np.sqrt(mean_squared_error(df.target.values, df.prediction.values))
    log("Score: {:0.4f}".format(score))
    return score
def hyperopt_xgboost(X: pd.DataFrame, y: pd.Series, params: Dict, config: Config):
    X_train, X_val, y_train, y_val = data_split(X, y, test_size=0.5)
    train_data = xgb.DMatrix(X_train, label=y_train)
    test_data = xgb.DMatrix(X_val, label=y_val)

    space = {
        "max_depth": hp.choice("max_depth", [4, 5, 6]),
        "min_child_weight": hp.choice("min_child_weight", [4, 8, 12, 16]),
        "gamma": hp.quniform("gamma", 0.1, 0.5, 0.1),
        "subsample": hp.choice("subsample", [i / 10.0 for i in range(6, 10)]),
        "colsample_bytree": hp.choice("colsample_bytree",
                                      [i / 10.0 for i in range(6, 10)]),
        "reg_alpha": hp.choice("reg_alpha", [0, 0.001, 0.005, 0.01, 0.05]),
    }

    def objective(hyperparams):
        watchlist = [(train_data, "train"), (test_data, "test")]
        mdl = xgb.train({**params, **hyperparams}, train_data, evals=watchlist,
                        num_boost_round=300, early_stopping_rounds=100,
                        verbose_eval=100)
        score = mdl.best_score
        if config.is_classification():
            score = -score
        return {'loss': score, 'status': STATUS_OK}

    trials = Trials()
    best = hyperopt.fmin(fn=objective, space=space, trials=trials,
                         algo=tpe.suggest, max_evals=50, verbose=1,
                         rstate=np.random.RandomState(1))
    hyperparams = space_eval(space, best)
    log("{:0.4f} {}".format(trials.best_trial['result']['loss'], hyperparams))
    return hyperparams
def drop_constant_columns(df: pd.DataFrame, config: Config):
    if "constant_columns" not in config:
        config["constant_columns"] = [
            c for c in df if df[c].nunique(dropna=False) < 2
        ]
        log("Constant columns: " + ", ".join(config["constant_columns"]))
    if len(config["constant_columns"]) > 0:
        df.drop(config["constant_columns"], axis=1, inplace=True, errors='ignore')
def read_raw_data():
    """Get the raw data."""
    log(f'Reading {DATASET_ID} raw data')
    # Read once to learn the columns, then re-read forcing everything to str.
    converters = {c: str
                  for c in pd.read_excel(DATA_XLSX, sheet_name=DATA_SHEET).columns}
    raw_data = pd.read_excel(
        DATA_XLSX, sheet_name=DATA_SHEET, converters=converters)
    raw_data = raw_data.rename(columns={'Genus': 'genus'})
    raw_data['dataset_id'] = DATASET_ID
    return raw_data
def __init__(self, dbconfig):
    try:
        self._conn = MySQLdb.connect(host=dbconfig['host'],
                                     port=dbconfig['port'],
                                     user=dbconfig['user'],
                                     passwd=dbconfig['passwd'],
                                     db=dbconfig['db'],
                                     charset=dbconfig['charset'])
    except MySQLdb.Error as e:
        self.error_code = e.args[0]
        util.log("MySQL error:%d %s" % (e.args[0], e.args[1]), 3, "mysql")
        return
def get_trace():
    if 'active_trace' not in session:
        log("No active trace found")
        return None
    tid = session['active_trace']
    trace = models.Trace.query.get(tid)
    if trace is None:
        log("ERR Trace is empty")
        return None
    return trace
def optimize_dataframe(df):
    """Optimize pandas dataframe size:

    - downcast numeric (int and float) type columns.
    - convert object columns with a 2x or better values/unique ratio
      to Categorical.

    :param df:
    :return:
    """
    int_cols = []
    float_cols = []
    category_cols = []
    other_cols = []

    old_size = sys.getsizeof(df)

    for col_name in df.columns:
        col_type = df.dtypes[col_name]
        if col_type in ['int', 'int32', 'int64']:
            int_cols.append(col_name)
        elif col_type in ['float', 'float32', 'float64']:
            float_cols.append(col_name)
        elif col_type == 'object':
            total = len(df[col_name])
            n_uniq = df[col_name].nunique()
            if n_uniq / total < 0.5:
                category_cols.append(col_name)
            else:
                other_cols.append(col_name)
        else:
            other_cols.append(col_name)

    df_opt = pd.DataFrame()
    if len(int_cols) > 0:
        df_opt[int_cols] = df[int_cols].apply(pd.to_numeric, downcast='integer')
    if len(float_cols) > 0:
        df_opt[float_cols] = df[float_cols].apply(pd.to_numeric, downcast='float')
    if len(category_cols) > 0:
        df_opt[category_cols] = df[category_cols].astype('category')
    if len(other_cols) > 0:
        df_opt[other_cols] = df[other_cols]

    new_size = sys.getsizeof(df_opt)
    log('optimize dataframe ({} to {}, ratio: {})'.format(
        old_size, new_size, round(old_size / new_size, 2)))
    return df_opt
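# Quick illustrative check of the downcasting (a sketch; relies on the
# module-level pandas import and log used above).
def _demo_optimize_dataframe():
    small = pd.DataFrame({'a': [1, 2, 3, 4, 5],
                          'b': [0.1, 0.2, 0.3, 0.4, 0.5],
                          'c': ['x', 'x', 'y', 'x', 'y']})
    optimized = optimize_dataframe(small)
    print(optimized.dtypes)  # expect int8, float32, category
    return optimized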
def create():
    """Create the database."""
    log('Creating database')
    script = fspath(SCRIPT_PATH / 'create_db.sql')
    cmd = f'sqlite3 {DB_FILE} < {script}'
    if exists(DB_FILE):
        remove(DB_FILE)
    subprocess.check_call(cmd, shell=True)
    insert_db_version()
def validate_dataset(alias: str, mode: str, train_limit: int) -> np.float64:
    log(alias)
    automl = AutoML("models/check_{}".format(alias))

    automl.config["time_limit"] = train_limit
    automl.train("data/check_{}/train.csv".format(alias), mode)

    automl.config["time_limit"] = 300
    _, score = automl.predict("data/check_{}/test.csv".format(alias),
                              "predictions/check_{}.csv".format(alias))
    return score
def subsample(df: pd.DataFrame, config: Config, max_size_mb: float = 2.0):
    if config.is_train():
        df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if df_size_mb > max_size_mb:
            mem_per_row = df_size_mb / len(df)
            sample_rows = int(max_size_mb / mem_per_row)
            log("Size limit exceeded: {:0.2f} Mb. Dataset rows: {}. "
                "Subsample to {} rows.".format(df_size_mb, len(df), sample_rows))
            _, df_drop = train_test_split(df, train_size=sample_rows,
                                          random_state=1)
            df.drop(df_drop.index, inplace=True)
            config["nrows"] = sample_rows
        else:
            config["nrows"] = len(df)
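# Worked example of the row budget above (illustrative numbers): a 20 Mb
# frame with 100,000 rows gives mem_per_row = 0.0002 Mb, so a 2 Mb budget
# keeps int(2 / 0.0002) = 10,000 rows; the remainder is dropped in place.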
def get_capture_date(file_path):
    try:
        metadata = exiftool(file_path)
    except CommandError as e:
        log('{}', e)
        return None

    capture_date_attributes = \
        'DateTimeOriginal MediaCreateDate CreateDate CreationDate ' \
        'DateCreated'.split()
    for i in capture_date_attributes:
        value = metadata.get(i)
        if value is not None:
            break
    else:
        log(
            'Did not find any of the recognized metadata fields {}. '
            'File {} has the fields {}.',
            ', '.join(capture_date_attributes), file_path, metadata)
        return None

    # Matches e.g. '2019:07:14 16:03:22', with optional fractional seconds
    # and an optional UTC offset such as '+02:00'.
    pattern = r'(?P<year>[0-9]{4})[:-]' \
              r'(?P<month>[0-9]{2})[:-]' \
              r'(?P<day>[0-9]{2}) ' \
              r'(?P<hour>[0-9]{2}):' \
              r'(?P<minute>[0-9]{2}):' \
              r'(?P<second>[0-9]{2})(\.[0-9]+)?([+-][0-9]{2}:[0-9]{2})?'
    match = re.match(pattern, value)
    assert match, 'Could not parse date: {}'.format(value)

    return datetime.datetime(
        *(int(match.group(i))
          for i in 'year month day hour minute second'.split()))
def execute_safely(self, function, *args, **kwargs):
    """
    Method prints what would be done if simulating, or does it otherwise.
    """
    def call_as_pretty_string():
        return "%s.%s(%s, %s)" % (
            function.__module__,
            function.__name__,
            ', '.join(repr(arg) for arg in args),
            ', '.join("%s=%s" % (repr(k), repr(v))
                      for k, v in kwargs.items()),
        )

    if self.simulate:
        log("simulating - would execute %s otherwise" % call_as_pretty_string())
        return None
    else:
        log("executing " + call_as_pretty_string())
        return function(*args, **kwargs)
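# Illustrative call (a sketch; assumes an object whose `simulate` flag was
# set from configuration, as in do_archive_directory below):
#
#     self.execute_safely(chmod, archive_path, 0o640)
#     # simulate=True  -> only logs the pretty-printed call
#     # simulate=False -> logs it and actually runs chmod(archive_path, 0o640)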
def do_archive_directory(self, directory_path):
    """
    Archives directory contents to trash if not empty.

    Returns True on success, False otherwise.
    """
    trash_file_path = self.trash_file_path(directory_path)
    archive_path = self.execute_safely(
        make_archive,
        trash_file_path,
        "bztar",
        dirname(directory_path),
        basename(directory_path))
    if not archive_path:
        log(u"ERROR: something went wrong - no archive "
            "file name found after archive creation!")
        return False
    self.execute_safely(chmod, archive_path, self.octal_permissions)
    return True
def help_and_exit():
    log("script for maintaining UNIX group accounts and home directories")
    log("")
    log("usage: [python3] ./sftponly.py [config file]")
    log("       explicit call of 'python3' turns on debug")
    exit(0)
for i in walk_visible_files(dir):
    file_dir, file_name = os.path.split(i)
    name_part, _ = os.path.splitext(file_name)
    try:
        date = datetime.datetime.strptime(name_part[:19], '%Y-%m-%d %H.%M.%S')
        target_dir = dir_for_date(date)
    except ValueError:
        target_dir = os.path.join(dir, 'unknown')
    move_to(i, os.path.join(target_dir, file_name))
    check_empty_dirs.append(file_dir)

while check_empty_dirs:
    empty_dirs = check_empty_dirs
    check_empty_dirs = []
    for i in empty_dirs:
        if is_empty(i):
            remove_dir(i)
            check_empty_dirs.append(os.path.dirname(i))

try:
    main(*sys.argv[1:])
except KeyboardInterrupt:
    log('Operation interrupted.')