def custom_corrections_checker(custom_corrections_dataset: dataiku.Dataset) -> Dict:
    """Validate the optional custom corrections dataset and load it as a dictionary.

    Args:
        custom_corrections_dataset: Dataset instance whose first column holds words
            and whose second column holds their corrections

    Returns:
        Dictionary of words (key) and their custom correction (value)

    Raises:
        PluginParamValidationError: If the dataset does not have exactly two columns
            or if one of the two columns is not of string type
    """
    columns = custom_corrections_dataset.get_config()["schema"]["columns"]
    if len(columns) != 2:
        raise PluginParamValidationError("Custom corrections dataset must have only two columns")

    word_column, correction_column = columns[0], columns[1]
    both_are_strings = word_column["type"] == "string" and correction_column["type"] == "string"
    if not both_are_strings:
        raise PluginParamValidationError("Columns of custom corrections dataset must be of string type")

    raw_df = custom_corrections_dataset.get_dataframe(infer_with_pandas=False)
    # Rows without a word are dropped; missing corrections become empty strings
    cleaned_df = clean_text_df(raw_df, dropna_columns=[word_column["name"]])
    cleaned_df = cleaned_df.fillna("").astype(str)
    # Pair the first column (words) with the second one (corrections), by position
    return dict(zip(cleaned_df.iloc[:, 0], cleaned_df.iloc[:, 1]))
def set_column_descriptions(
    output_dataset: dataiku.Dataset, column_descriptions: Dict, input_dataset: dataiku.Dataset = None
) -> None:
    """Set column descriptions of the output dataset based on a dictionary of column descriptions

    Retain the column descriptions from the input dataset if the column name matches.

    Args:
        output_dataset: Output dataiku.Dataset instance
        column_descriptions: Dictionary holding column descriptions (value) by column name (key)
        input_dataset: Optional input dataiku.Dataset instance
            in case you want to retain input column descriptions
    """
    output_dataset_schema = output_dataset.read_schema()

    # Index the input comments once instead of rescanning the input schema for every output column.
    # `setdefault` keeps the first column when a name is duplicated, like a first-match scan would.
    input_comments = {}
    if input_dataset is not None:
        for input_col_info in input_dataset.read_schema():
            input_comments.setdefault(input_col_info["name"], input_col_info.get("comment", ""))

    for output_col_info in output_dataset_schema:
        output_col_name = output_col_info.get("name", "")
        if output_col_name in input_comments:
            # The input comment takes precedence, even when it is empty
            output_col_info["comment"] = input_comments[output_col_name]
        else:
            output_col_info["comment"] = column_descriptions.get(output_col_name)
    output_dataset.write_schema(output_dataset_schema)
def custom_vocabulary_checker(custom_vocabulary_dataset: dataiku.Dataset) -> Set:
    """Validate the optional custom vocabulary dataset and load it as a set of words.

    Args:
        custom_vocabulary_dataset: Dataset with a single column for words that should not be corrected

    Returns:
        Set of words in the custom vocabulary

    Raises:
        PluginParamValidationError: If the dataset does not have exactly one column
            or if that column is not of string type
    """
    columns = custom_vocabulary_dataset.get_config()["schema"]["columns"]
    if len(columns) != 1:
        raise PluginParamValidationError("Custom vocabulary dataset must have only one column")

    only_column = columns[0]
    col_name = only_column["name"]
    if only_column["type"] != "string":
        raise PluginParamValidationError("Column of custom vocabulary dataset must be of string type")

    raw_df = custom_vocabulary_dataset.get_dataframe(infer_with_pandas=False)
    cleaned_df = clean_text_df(raw_df)
    words = cleaned_df[col_name].astype(str).tolist()
    return set(words)
def count_records(dataset: dataiku.Dataset) -> int:
    """Count the number of records of a dataset using the Dataiku dataset metrics API

    The metric is (re)computed before being read, globally for a non-partitioned dataset
    or partition by partition otherwise.

    Args:
        dataset: dataiku.Dataset instance

    Returns:
        Number of records, summed over the read partitions if there are any
    """
    metric_id = "records:COUNT_RECORDS"
    partitions = dataset.read_partitions
    client = dataiku.api_client()
    # Use the project and short name of the dataset itself: parsing `dataset.name` fails on
    # unqualified names, and the default project is wrong for a dataset from another project
    dataset_name = dataset.short_name
    project = client.get_project(dataset.project_key)
    dataset_handle = project.get_dataset(dataset_name)
    logging.info("Counting records of dataset: {}".format(dataset_name))
    if not partitions:
        dataset_handle.compute_metrics(metric_ids=[metric_id])
        metric = dataset.get_last_metric_values()
        record_count = dataiku.ComputedMetrics.get_value_from_data(metric.get_global_data(metric_id=metric_id))
        logging.info("Dataset contains {:d} records and is not partitioned".format(record_count))
    else:
        record_count = 0
        for partition in partitions:
            dataset_handle.compute_metrics(partition=partition, metric_ids=[metric_id])
            metric = dataset.get_last_metric_values()
            record_count += dataiku.ComputedMetrics.get_value_from_data(
                metric.get_partition_data(partition=partition, metric_id=metric_id)
            )
        logging.info("Dataset contains {:d} records in partition(s) {}".format(record_count, partitions))
    return record_count
def set_column_description(
    input_dataset: dataiku.Dataset,
    output_dataset: dataiku.Dataset,
    column_description_dict: Dict,
) -> None:
    """Set column descriptions of the output dataset based on a dictionary of column descriptions

    Retain the column descriptions from the input dataset if the column name matches.

    Args:
        input_dataset: Input dataiku.Dataset instance whose column descriptions are retained
        output_dataset: Output dataiku.Dataset instance whose schema is updated
        column_description_dict: Dictionary holding column descriptions (value) by column name (key)
    """
    input_dataset_schema = input_dataset.read_schema()
    output_dataset_schema = output_dataset.read_schema()

    # Index the input comments once instead of rescanning the input schema for every output column.
    # `setdefault` keeps the first column when a name is duplicated, like a first-match scan would.
    input_comments = {}
    for input_col_info in input_dataset_schema:
        input_comments.setdefault(input_col_info["name"], input_col_info.get("comment", ""))

    for output_col_info in output_dataset_schema:
        output_col_name = output_col_info.get("name", "")
        if output_col_name in input_comments:
            # The input comment takes precedence, even when it is empty
            output_col_info["comment"] = input_comments[output_col_name]
        else:
            output_col_info["comment"] = column_description_dict.get(output_col_name)
    output_dataset.write_schema(output_dataset_schema)
def load_input_output(config):
    """Attach the recipe input and output datasets to `config`.

    Sets `config.input_dataset` from the first name of the "input_dataset" role and
    `config.output_dataset` from the first name of the "output_dataset" role.

    Args:
        config: Object receiving the `input_dataset` and `output_dataset` attributes

    Raises:
        ValueError: If no dataset is bound to the "input_dataset" role
    """
    input_names = get_input_names_for_role("input_dataset")
    if not input_names:
        raise ValueError("No input dataset.")
    config.input_dataset = Dataset(input_names[0])

    output_names = get_output_names_for_role("output_dataset")
    config.output_dataset = Dataset(output_names[0])
def __init__(self, dataset=None, database=None, connection=None, vtype=None):
    """Resolve the virtual connection string from a dataset, a connection or a database.

    Args:
        dataset: Dataset instance or dataset name; its full name becomes the connection string
        database: Database name, formatted as "@virtual(<vtype>):<database>"
        connection: Connection name, formatted as "@virtual(<vtype>):connection:<connection>"
        vtype: Virtual connection type inserted in the formatted string

    Raises:
        ValueError: If more than one of connection, database or dataset is given
    """
    targets_given = sum(1 for target in (connection, dataset, database) if target)
    if targets_given > 1:
        raise ValueError("only one of connection, database or dataset should be given")

    if dataset:
        dataset_handle = dataset if isinstance(dataset, Dataset) else Dataset(dataset)
        vconn = dataset_handle.full_name
        from_dataset = True
    elif connection:
        vconn = "@virtual(%s):connection:%s" % (vtype, connection)
        from_dataset = False
    elif database:
        vconn = "@virtual(%s):%s" % (vtype, database)
        from_dataset = False
    else:
        # Nothing given: both attributes stay undefined (None)
        vconn = None
        from_dataset = None

    self._vconn = vconn
    self._find_connection_from_dataset = from_dataset
    print("Vconn = %s find=%s" % (self._vconn, self._find_connection_from_dataset))
def _add_output_dataset(self):
    """Register the first dataset of the "tagged_documents" output role as the required `output_dataset` param"""
    tagged_documents_names = get_output_names_for_role("tagged_documents")
    output_dataset = Dataset(tagged_documents_names[0])
    self.dku_config.add_param(name="output_dataset", value=output_dataset, required=True)
def __init__(self):
    """Initialize the parent class, then add the input datasets to dku_config

    The first dataset of the "document_dataset" role is stored as the required `text_input` param and
    the first dataset of the "ontology_dataset" role as the required `ontology_input` param.
    The column names of both datasets are then read from their schemas.
    """
    super().__init__()

    # (input role, dku_config parameter name), registered in this order
    role_to_param_name = (
        ("document_dataset", "text_input"),
        ("ontology_dataset", "ontology_input"),
    )
    for role, param_name in role_to_param_name:
        dataset_name = get_input_names_for_role(role)[0]
        self.dku_config.add_param(name=param_name, value=Dataset(dataset_name), required=True)

    document_schema = self.dku_config.text_input.read_schema()
    self.document_dataset_columns = [column["name"] for column in document_schema]
    ontology_schema = self.dku_config.ontology_input.read_schema()
    self.ontology_dataset_columns = [column["name"] for column in ontology_schema]
def process_dataset_chunks(
    input_dataset: dataiku.Dataset, output_dataset: dataiku.Dataset, func: Callable, chunksize: float = 1000, **kwargs
) -> None:
    """Read a dataset by chunks, process each dataframe chunk with a function and write back to another dataset.

    Keyword arguments are passed to the function. Progress is shown with a tqdm bar and logged.
    Chunks are written directly to the output dataset, so only one chunk is held in memory at a time.

    Args:
        input_dataset: Input dataiku.Dataset instance
        output_dataset: Output dataiku.Dataset instance
        func: The function to apply to the `input_dataset` by chunks of pandas.DataFrame
            This function must take a pandas.DataFrame as first input argument,
            and output another pandas.DataFrame
        chunksize: Number of rows of each chunk of pandas.DataFrame fed to `func`
        **kwargs: Optional keyword arguments fed to `func`

    Raises:
        ValueError: If the input dataset is empty
    """
    n_records = count_records(input_dataset)
    if n_records == 0:
        raise ValueError("Input dataset has no records")
    logging.info(f"Processing dataset {input_dataset.name} of {n_records} rows by chunks of {chunksize}...")
    start = perf_counter()

    # First, initialize output schema if not present. Required to show the real error if `iter_dataframes` fails.
    if not output_dataset.read_schema(raise_if_empty=False):
        sample_df = input_dataset.get_dataframe(limit=5, infer_with_pandas=False)
        sample_output_df = func(df=sample_df, **kwargs)
        output_dataset.write_schema_from_dataframe(sample_output_df)

    with output_dataset.get_writer() as writer:
        chunks = input_dataset.iter_dataframes(chunksize=chunksize, infer_with_pandas=False)
        n_chunks = math.ceil(n_records / chunksize)
        progress = tqdm(enumerate(chunks), total=n_chunks, unit="chunk", mininterval=1.0)
        for chunk_index, chunk_df in progress:
            processed_df = func(df=chunk_df, **kwargs)
            if chunk_index == 0:
                # The schema comes from the first processed chunk;
                # the table is dropped and recreated only when no write partition is set
                drop_and_create = not output_dataset.writePartition
                output_dataset.write_schema_from_dataframe(processed_df, dropAndCreate=drop_and_create)
            writer.write_dataframe(processed_df)

    elapsed = perf_counter() - start
    logging.info(f"Processing dataset {input_dataset.name} of {n_records} rows: Done in {elapsed:.2f} seconds.")
def output_generator():
    """Yield the input dataset by chunks, each with its cluster label and additional columns.

    Relies on names from the enclosing scope (`input_dataset`, `pipeline`, `clf`, `recipe_desc`,
    `preprocessing_params`, `modeling_params`, `naming`, `cluster_name_map`, ...).

    Yields:
        One pandas.DataFrame per chunk of 100000 rows: the (optionally filtered) input columns,
        a `cluster_labels` column and the additional columns returned by `clustering_predict`
    """
    logging.info("Start output generator ...")

    (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
        preparation_output_schema["columns"], parse_dates=True, infer_with_pandas=False)
    logging.info("Reading with INITIAL dtypes: %s" % dtypes)
    # The dtypes derived from the feature parameters replace the ones read from the schema
    dtypes = utils.ml_dtypes_from_dss_schema(preparation_output_schema, preprocessing_params["per_feature"])
    logging.info("Reading with dtypes: %s" % dtypes)

    for input_df in input_dataset.iter_dataframes_forced_types(
            names, dtypes, parse_date_columns, chunksize=100000):
        # Reset the index to 0..n-1 for each chunk
        input_df.index = range(input_df.shape[0])
        # Keep an untouched copy: `input_df` is normalized in place below
        input_df_orig = input_df.copy()
        if recipe_desc.get("filterInputColumns", False):
            input_df_orig = input_df_orig[recipe_desc["keptInputColumns"]]

        logging.info("Got a dataframe : %s" % str(input_df.shape))
        normalize_dataframe(input_df, preprocessing_params['per_feature'])

        for col in input_df:
            logging.info("NORMALIZED: %s -> %s" % (col, input_df[col].dtype))

        logging.info("Processing it")
        transformed = pipeline.process(input_df)

        logging.info("Applying it")
        (labels_arr, additional_columns) = clustering_predict(modeling_params, clf, transformed)
        cluster_labels = pd.Series(labels_arr, name="cluster_labels").map(naming)
        # Align the labels on the rows of the transformed data before the left join
        cluster_labels.index = transformed["TRAIN"].index

        final_df = pd.concat([input_df_orig.join(cluster_labels, how='left'), additional_columns], axis=1)

        if preprocessing_params["outliers"]["method"] == "CLUSTER":
            outliers_cluster_name = cluster_name_map.get(constants.CLUSTER_OUTLIERS, constants.CLUSTER_OUTLIERS)
            # Assign the result back: a chained `fillna(..., inplace=True)` on a selected column
            # does not update the frame when pandas copy-on-write is enabled
            final_df['cluster_labels'] = final_df['cluster_labels'].fillna(outliers_cluster_name)

        logging.info("Done predicting it")
        yield final_df
def __init__(self, connection=None, dataset=None):
    """Resolve the connection identifier from either a connection name or a dataset.

    Args:
        connection: Connection name, stored as is when no dataset is given
        dataset: Dataset instance or dataset name; its full name is stored instead of a connection

    Raises:
        ValueError: If both connection and dataset are given
    """
    if connection and dataset:
        raise ValueError("only one of connection or dataset should be given")

    if not dataset:
        self._iconn = connection
        self._find_connection_from_dataset = False
        return

    dataset_handle = dataset if isinstance(dataset, Dataset) else Dataset(dataset)
    self._iconn = dataset_handle.full_name
    self._find_connection_from_dataset = True
def count_records(dataset: dataiku.Dataset) -> int:
    """Count the records of a dataset with the Dataiku dataset metrics API

    The "records:COUNT_RECORDS" metric is computed before being read, globally for a
    non-partitioned dataset or once per read partition otherwise.

    Args:
        dataset: dataiku.Dataset instance

    Returns:
        Number of records
    """
    metric_id = "records:COUNT_RECORDS"
    partitions = dataset.read_partitions
    project = dataiku.api_client().get_project(dataset.project_key)
    record_count = 0
    logging.info(f"Counting records of dataset: {dataset.name}...")

    has_partitions = partitions is not None and len(partitions) != 0
    if has_partitions:
        for partition in partitions:
            project.get_dataset(dataset.short_name).compute_metrics(partition=partition, metric_ids=[metric_id])
            last_metrics = dataset.get_last_metric_values()
            partition_data = last_metrics.get_partition_data(partition=partition, metric_id=metric_id)
            record_count += dataiku.ComputedMetrics.get_value_from_data(partition_data)
        logging.info(f"Dataset {dataset.name} contains {record_count:d} records in partition(s) {partitions}")
    else:
        project.get_dataset(dataset.short_name).compute_metrics(metric_ids=[metric_id])
        last_metrics = dataset.get_last_metric_values()
        global_data = last_metrics.get_global_data(metric_id=metric_id)
        record_count = dataiku.ComputedMetrics.get_value_from_data(global_data)
        logging.info(f"Dataset {dataset.name} contains {record_count:d} records and is not partitioned")
    return record_count
def df_from_split_desc_no_normalization(split_desc, split, feature_params, prediction_type=None):
    """Load one split of a dataset into a pandas.DataFrame, without normalizing it.

    Args:
        split_desc: Split description with the keys "format", "schema" and
            "fullPath" / "trainPath" / "testPath"
        split: "full" to read the full file, "train" to read the train file,
            any other value to read the test file
        feature_params: Per-feature parameters used to compute the dtypes of the columns
        prediction_type: Optional prediction type, forwarded to the dtypes computation

    Returns:
        pandas.DataFrame read from the tab-separated file of the requested split

    Raises:
        Exception: If the format of the split is not "csv1"
    """
    if split_desc["format"] != "csv1":
        raise Exception("Unsupported format")

    (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
        split_desc["schema"]["columns"], parse_dates=True, infer_with_pandas=True)

    # Explicit branches: with `cond and a or b`, a falsy "trainPath" silently selected the test file
    if split == "full":
        f = split_desc["fullPath"]
    elif split == "train":
        f = split_desc["trainPath"]
    else:
        f = split_desc["testPath"]

    # We infer everything with Pandas, EXCEPT booleans.
    # Because then pandas completely looses the original syntax
    # So for example if target is true/false, and we let pandas infer, then it will become
    # True/False, and when we remap, we try to remap with true/false and end up with no
    # target at all
    # for col in split_desc["schema"]["columns"]:
    #     if col["type"] == "boolean":
    #         if dtypes is None:
    #             dtypes = {}
    #         dtypes[col["name"]] = "str"

    logging.info("Reading with dtypes: %s" % dtypes)
    # The dtypes derived from the feature parameters replace the ones inferred above
    dtypes = utils.ml_dtypes_from_dss_schema(split_desc["schema"], feature_params, prediction_type=prediction_type)
    logging.info("Reading with FIXED dtypes: %s" % dtypes)

    df = pd.read_table(f,
                       names=names,
                       dtype=dtypes,
                       header=None,
                       sep='\t',
                       doublequote=True,
                       quotechar='"',
                       parse_dates=parse_date_columns,
                       float_precision="round_trip")
    logging.info("Loaded table")
    return df
def run(self, progress_callback):
    """Tag each project with the connections used by its datasets and SQL notebooks.

    For every project key in `self.project_keys`, the connection names are collected from the
    dataset definitions and from the SQL notebooks, then added to the project tags as
    "<prefix><connection>". With the "clobber" option, existing tags starting with the prefix
    are removed first.

    Args:
        progress_callback: Called with the number of projects processed so far
    """
    clobber = self.config.get("clobber", False)
    prefix = self.config.get("prefix")
    # Raw string: `\(` and `\)` are regex escapes, not string escapes
    virtual_connection_pattern = re.compile(r'@virtual\(([^\)]+)\):(.*)')

    done = 0
    for project_key in self.project_keys:
        project = self.client.get_project(project_key)
        # Collected per project, so that a project is not tagged with the connections
        # of the projects processed before it
        connections = set()

        for dataset_name in Dataset.list(project_key=project_key):
            d = project.get_dataset(dataset_name)
            connection_name = d.get_definition().get('params', {}).get('connection', None)
            if connection_name is not None:
                connections.add(connection_name)

        sql_notebooks = intercom.backend_json_call("sql-notebooks/list/", data={"projectKey": project_key})
        for sql_notebook in sql_notebooks:
            connection_name = sql_notebook.get('connection', None)
            if connection_name is not None:
                # Virtual connections are reported under the name "hive-<second group of the pattern>"
                m = virtual_connection_pattern.search(connection_name)
                if m is not None:
                    connection_name = 'hive-%s' % m.group(2)
                connections.add(connection_name)

        meta = project.get_metadata()

        # Update tags list
        if clobber:
            tags = [x for x in meta["tags"] if not x.startswith(prefix)]
        else:
            tags = meta["tags"]
        # Sorted for a deterministic order; tags already present are not added again
        for connection in sorted(connections):
            tag = "%s%s" % (prefix, connection)
            if tag not in tags:
                tags.append(tag)
        meta["tags"] = tags
        project.set_metadata(meta)

        done += 1
        progress_callback(done)
def process_dataset_chunks(
    input_dataset: dataiku.Dataset, output_dataset: dataiku.Dataset, func: Callable, chunksize: float = 10000, **kwargs
) -> None:
    """Read a dataset by chunks, process each dataframe chunk with a function and write back to another dataset.

    A tqdm progress bar and generic logging are added.

    Args:
        input_dataset: Input dataiku.Dataset instance
        output_dataset: Output dataiku.Dataset instance
        func: Function applied to each chunk. It is called as `func(df=chunk, **kwargs)`
            and its result is written to the output dataset
        chunksize: Number of rows of each chunk fed to `func`
        **kwargs: Optional keyword arguments fed to `func`
    """
    # `{}` rather than `{:d}`: the latter raises on a float, which the signature allows
    logging.info("Processing dataframe chunks of size {}...".format(chunksize))
    # Count the records before opening the writer, so that a failure of the metrics
    # computation happens before anything is opened on the output dataset
    len_iterator = math.ceil(count_records(input_dataset) / chunksize)
    with output_dataset.get_writer() as writer:
        df_iterator = input_dataset.iter_dataframes(chunksize=chunksize, infer_with_pandas=False)
        for i, df in tqdm(enumerate(df_iterator), total=len_iterator):
            output_df = func(df=df, **kwargs)
            if i == 0:
                # The schema comes from the first processed chunk; the table is dropped and
                # recreated only when the output has no write partition
                if output_dataset.writePartition is None or output_dataset.writePartition == "":
                    output_dataset.write_schema_from_dataframe(output_df, dropAndCreate=True)
                else:
                    output_dataset.write_schema_from_dataframe(output_df)
            writer.write_dataframe(output_df)
    logging.info("Processing dataframe chunks: Done!")
def _streamed_query_to_df(connection, query, pre_queries, post_queries, find_connection_from_dataset, db_type,
                          extra_conf={}, infer_from_schema=False, parse_dates=True, bool_as_str=False,
                          dtypes=None, script_steps=None, script_input_schema=None, script_output_schema=None):
    """Run a SQL query through the backend and stream its result into a pandas.DataFrame.

    Args:
        connection: Connection identifier sent to the backend
        query: SQL query to run
        pre_queries: Queries sent as "preQueries" (JSON-encoded)
        post_queries: Queries sent as "postQueries" (JSON-encoded)
        find_connection_from_dataset: Sent as "findConnectionFromDataset"
        db_type: Sent as "dbType"
        extra_conf: Extra configuration, JSON-encoded (read only, never modified here)
        infer_from_schema: If True and the backend returns a schema, dtypes and date columns
            are taken from that schema instead of `dtypes`
        parse_dates: Forwarded to the schema conversion when `infer_from_schema` applies
        bool_as_str: Forwarded to the schema conversion when `infer_from_schema` applies
        dtypes: dtypes given to pandas when they are not inferred from the schema
        script_steps: Optional script steps, JSON-encoded when not None
        script_input_schema: Optional script input schema, JSON-encoded when not None
        script_output_schema: Optional script output schema, JSON-encoded when not None

    Returns:
        pandas.DataFrame with the query results, empty if the query has no results
    """
    import pandas as pd

    def _json_or_none(value):
        # Optional parameters are sent as None rather than as the JSON string "null"
        return None if value is None else json.dumps(value)

    data = {
        "connection": connection,
        "query": query,
        "preQueries": json.dumps(pre_queries),
        "postQueries": json.dumps(post_queries),
        "findConnectionFromDataset": find_connection_from_dataset,
        "dbType": db_type,
        "extraConf": json.dumps(extra_conf),
        "scriptSteps": _json_or_none(script_steps),
        "scriptInputSchema": _json_or_none(script_input_schema),
        "scriptOutputSchema": _json_or_none(script_output_schema),
    }

    logging.info("Starting SQL query reader")
    # initiate the streaming (blocks until the database says it's ready to return values)
    session = backend_json_call("sql-queries/start-streaming", data=data)
    logging.info("Got initial SQL query response")
    query_id = session['queryId']

    # handle the special case of 'nothing to stream'
    if session['hasResults'] == False:
        return pd.DataFrame()

    parse_date_columns = None
    if infer_from_schema and "schema" in session:
        # The column names of the schema are not used: pandas reads them from the stream
        (_, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
            session["schema"], parse_dates=parse_dates, bool_as_str=bool_as_str)

    # fetch the data...
    resp_stream = backend_stream_call("sql-queries/stream", data={"queryId": query_id}, err_msg="Query failed")
    # ... and stuff it (streamed) in a dataframe
    results = pd.read_table(resp_stream,
                            sep='\t',
                            doublequote=True,
                            quotechar='"',
                            dtype=dtypes,
                            parse_dates=parse_date_columns)

    # The verification has to come after the dataframe creation: the data is streamed,
    # and the stream call returns before the query is actually done
    backend_void_call("sql-queries/verify", data={"queryId": query_id}, err_msg="Query failed")
    return results
def run(self, progress_callback):
    """Export a Snowflake dataset to a Snowflake stage with a COPY INTO command.

    On success, an HTML message is returned with `success()`. On error, an exception is raised
    instead of returning an HTML message, so that the step is considered as failed when the
    macro is called from a scenario.

    Args:
        progress_callback: Not used by this macro

    Returns:
        The HTML success message built by `success()`

    Raises:
        ValueError: If the dataset or the stage parameter is missing or empty,
            or if the dataset is not a Snowflake dataset
    """
    config = self.config

    # The dataset is given as `input_dataset` when the macro is run from the flow,
    # and as `dataset` when it is run from a scenario
    dataset_param = 'input_dataset' if 'input_dataset' in config else 'dataset'
    dataset_name = config.get(dataset_param)
    if not dataset_name:
        logging.error('The mandatory param `dataset` is missing or invalid to export dataset to Snowflake stage')
        raise ValueError("The mandatory parameter `Dataset to export` is invalid")

    # We use the API `dataiku.core.dataset.Dataset.get_location_info` rather than
    # `dataikuapi.dss.dataset.DSSDataset.get_settings().get_raw_params()` because it expands variables
    # if any in the connection settings (see https://doc.dataiku.com/dss/latest/variables/index.html)
    location_info = Dataset(dataset_name).get_location_info()
    dataset_connection_info = location_info["info"]
    if dataset_connection_info.get("databaseType") != 'Snowflake':
        logging.error('Cannot export non Snowflake dataset `%s.%s` to Snowflake stage',
                      self.project_key, dataset_name)
        raise ValueError(f"'{dataset_name}' is not a Snowflake dataset")

    mandatory_params = [{"name": "Snowflake stage", "id": "stage"}]
    for param in mandatory_params:
        is_set = param['id'] in config and config.get(param['id'])
        if not is_set:
            logging.error(
                'The mandatory param `%s` is missing or invalid to export dataset `%s.%s` to Snowflake stage',
                param['name'], self.project_key, dataset_name)
            raise ValueError(f"The parameter '{param['name']}' is invalid")

    fully_qualified_stage_name = config.get('stage')

    # The path defaults to the project key; surrounding spaces and slashes are removed
    raw_path = config.get('path') or self.project_key
    output_path = raw_path.strip(' ').strip('/')
    destination = os.path.join(output_path, dataset_name)

    file_format_param = config.get('file_format') or 'default'
    if file_format_param == 'default':
        file_format = ''
    else:
        file_format = f"FILE_FORMAT = (FORMAT_NAME = {file_format_param})"

    if config.get("overwrite"):
        overwrite = 'OVERWRITE = TRUE'
    else:
        overwrite = ''

    source_table = resolve_table_name(dataset_connection_info)
    sql_copy_query = (
        f"COPY INTO @{fully_qualified_stage_name}/{destination} FROM {source_table} {file_format} {overwrite}"
    )

    logging.info("Exporting dataset `%s.%s` with the copy command: `%s`",
                 self.project_key, dataset_name, sql_copy_query)

    executor = SQLExecutor2(dataset=dataset_name)
    executor.query_to_df(sql_copy_query)

    logging.info(
        f"Successfully exported dataset `{self.project_key}.{dataset_name}` "
        f"in Snowflake stage `{fully_qualified_stage_name}` to `{destination}`"
    )

    displayed_stage_name = fully_qualified_stage_name.replace('"', '')
    return success(
        'The dataset has been successfully exported in stage <strong>%s</strong> to <strong>%s_*</strong>'
        % (displayed_stage_name, destination))
def main(model_folder, input_dataset_smartname, output_dataset_smartname, metrics_dataset_smartname, recipe_desc,
         script, preparation_output_schema, cond_outputs=None):
    """Score a dataset with a saved model, then write the scored data and the evaluation metrics.

    The model parameters, the collector data and the pickled classifier are loaded from `model_folder`.
    The input dataset is read through the preparation script, scored according to the prediction type
    found in the core params, enriched with evaluation columns and written to the output dataset.
    Metrics are then computed and written to the metrics dataset when its name is given.

    NOTE(review): this function uses Python 2 only constructs (`xrange`, `dict.has_key`, `.next()`).

    Args:
        model_folder: Folder holding "core_params.json", "rpreprocessing_params.json",
            "rmodeling_params.json", "collector_data.json", "clf.pkl" and, when probability
            percentiles are needed, "perf.json"
        input_dataset_smartname: Name of the dataset to score
        output_dataset_smartname: Name of the dataset receiving the scored data
        metrics_dataset_smartname: Name of the dataset receiving the metrics; nothing is written if falsy
        recipe_desc: Recipe settings, read for the keys "forcedClassifierThreshold", "outputProbabilities",
            "outputProbaPercentiles", "filterInputColumns" and "keptInputColumns"
        script: Preparation script; its "steps" are applied to the input dataset
        preparation_output_schema: Schema of the output of the preparation script
        cond_outputs: Optional list of conditional outputs, each with the keys "input", "name", "rules"
            and optionally "defaultOutput" (binary classification only)

    Raises:
        ValueError: If the prediction type is not binary classification, multiclass or regression
        Exception: If probability percentiles are needed but missing from "perf.json"
    """
    # Obtain a streamed result of the preparation
    input_dataset = dataiku.Dataset(input_dataset_smartname)
    logging.info("Will do preparation, output schema: %s" % preparation_output_schema)
    input_dataset.set_preparation_steps(script["steps"], preparation_output_schema)

    core_params = dkujson.load_from_filepath(osp.join(model_folder, "core_params.json"))
    preprocessing_params = dkujson.load_from_filepath(osp.join(model_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(osp.join(model_folder, "rmodeling_params.json"))
    collector_data = dkujson.load_from_filepath(osp.join(model_folder, "collector_data.json"))

    preprocessing_handler = PreprocessingHandler.build(core_params, preprocessing_params, model_folder)
    preprocessing_handler.collector_data = collector_data
    # The pipeline is built with the target, which is read back below from `transformed["target"]`
    pipeline = preprocessing_handler.build_preprocessing_pipeline(with_target=True)

    # NOTE(review): unpickling executes arbitrary code; the model folder must be trusted
    with open(osp.join(model_folder, "clf.pkl"), "rb") as f:
        clf = pickle.load(f)

    logging.info("Scoring data")

    (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
        preparation_output_schema["columns"], parse_dates=True, infer_with_pandas=False)
    logging.info("Reading with INITIAL dtypes: %s" % dtypes)
    # The dtypes derived from the feature parameters replace the ones read from the schema
    dtypes = utils.ml_dtypes_from_dss_schema(preparation_output_schema,
                                             preprocessing_params["per_feature"],
                                             prediction_type=core_params["prediction_type"])
    logging.info("Reading with dtypes: %s" % dtypes)

    for i in xrange(0, len(names)):
        logging.info("Column %s = %s (dtype=%s)" % (i, names[i], dtypes.get(names[i], None)))

    # The whole dataset is read into a single dataframe (no limit, no ratio)
    with input_dataset._stream(infer_with_pandas=True,
                               sampling='head',
                               sampling_column=None,
                               limit=None,
                               ratio=None,
                               columns=names) as stream:
        input_df = pd.read_table(stream,
                                 names=names,
                                 dtype=dtypes,
                                 header=None,
                                 sep='\t',
                                 doublequote=True,
                                 quotechar='"',
                                 parse_dates=parse_date_columns,
                                 float_precision="round_trip")

    # Keep an untouched copy: the kept input columns of the output are taken from it
    input_df_orig = input_df.copy()
    logging.info("Got a dataframe : %s" % str(input_df.shape))

    normalize_dataframe(input_df, preprocessing_params['per_feature'])

    for col in input_df:
        logging.info("NORMALIZED: %s -> %s" % (col, input_df[col].dtype))

    logging.info("Processing it")
    transformed = pipeline.process(input_df)
    logging.info("Predicting it")

    if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
        pred_df = binary_classification_predict(
            clf,
            pipeline,
            modeling_params,
            preprocessing_params,
            preprocessing_handler.target_map,
            recipe_desc["forcedClassifierThreshold"],
            input_df,
            output_probas=recipe_desc["outputProbabilities"],
            # For ensemble model, we need to indicate that we have target, so that a target-aware pipeline is
            # selected. See 0c87605 for more information
            ensemble_has_target=True)

        # Probability percentile & Conditional outputs
        has_cond_output = recipe_desc["outputProbabilities"] and cond_outputs
        # Percentiles are needed when requested as output, or when a conditional output reads them
        has_percentiles = recipe_desc["outputProbaPercentiles"] or (
            has_cond_output and len([co for co in cond_outputs if co["input"] == "proba_percentile"]))
        if has_percentiles:
            model_perf = dkujson.load_from_filepath(osp.join(model_folder, "perf.json"))
            if model_perf.has_key("probaPercentiles") and model_perf["probaPercentiles"]:
                percentile = pd.Series(model_perf["probaPercentiles"])
                # Probability column of the class mapped to 1 in the target map
                proba_1 = "proba_" + str((k for k, v in preprocessing_handler.target_map.items() if v == 1).next())
                # 1 + number of stored percentile thresholds that are lower than or equal to the probability
                pred_df["proba_percentile"] = pred_df[proba_1].apply(
                    lambda p: percentile.where(percentile <= p).count() + 1)
            else:
                raise Exception("Probability percentiles are missing from model.")
        if has_cond_output:
            for co in cond_outputs:
                inp = pred_df[co["input"]]
                acc = inp.notnull()  # condition accumulator
                # Rules are applied in order: a row takes the output of the first rule it matches
                # NOTE(review): an operation other than GT/GE/LT/LE does not assign `cond` in this
                # iteration, so the value of the previous rule (or an undefined name) is used
                for r in co["rules"]:
                    if r["operation"] == 'GT':
                        cond = inp > r["operand"]
                    elif r["operation"] == 'GE':
                        cond = inp >= r["operand"]
                    elif r["operation"] == 'LT':
                        cond = inp < r["operand"]
                    elif r["operation"] == 'LE':
                        cond = inp <= r["operand"]
                    pred_df.loc[acc & cond, co["name"]] = r["output"]
                    acc = acc & (~cond)
                # Non-null rows matched by no rule get the default output
                pred_df.loc[acc, co["name"]] = co.get("defaultOutput", "")
        if has_percentiles and not recipe_desc["outputProbaPercentiles"]:  # was only for conditional outputs
            pred_df.drop("proba_percentile", axis=1, inplace=True)

    elif core_params["prediction_type"] == constants.MULTICLASS:
        pred_df = multiclass_predict(
            clf,
            pipeline,
            modeling_params,
            preprocessing_params,
            preprocessing_handler.target_map,
            input_df,
            output_probas=recipe_desc["outputProbabilities"],
            # For ensemble model, we need to indicate that we have target, so that a target-aware pipeline is
            # selected. See 0c87605 for more information
            ensemble_has_target=True)

    elif core_params["prediction_type"] == constants.REGRESSION:
        pred_df = regression_predict(
            clf,
            pipeline,
            modeling_params,
            input_df,
            # For ensemble model, we need to indicate that we have target, so that a target-aware pipeline is
            # selected. See 0c87605 for more information
            ensemble_has_target=True)

    else:
        raise ValueError("bad prediction type %s" % core_params["prediction_type"])

    # add error information to pred_df
    y = transformed["target"]
    target_mapping = {}
    if core_params["prediction_type"] in [constants.BINARY_CLASSIFICATION, constants.MULTICLASS]:
        target_mapping = {
            label: int(class_id)
            for label, class_id in preprocessing_handler.target_map.items()
        }
    pred_df = add_evaluation_columns(core_params["prediction_type"], pred_df, y, target_mapping)

    logging.info("Done predicting it")
    # Input columns that have the same name as a prediction column are not kept
    if recipe_desc.get("filterInputColumns", False):
        clean_kept_columns = [c for c in recipe_desc["keptInputColumns"] if c not in pred_df.columns]
    else:
        clean_kept_columns = [c for c in input_df_orig.columns if c not in pred_df.columns]
    output_df = pd.concat([input_df_orig[clean_kept_columns], pred_df], axis=1)

    # write scored data
    output_dataset = dataiku.Dataset(output_dataset_smartname)
    #logging.info("writing scored schema")
    #output_dataset.write_schema_from_dataframe(output_df) # backend should do this
    logging.info("writing scored data")
    output_dataset.write_from_dataframe(output_df)

    # Sample weights are only used by the metrics when the weight method includes sample weights
    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    if with_sample_weight:
        sample_weight = transformed["weight"]
    else:
        sample_weight = None

    metrics_df = compute_metrics_df(core_params["prediction_type"], target_mapping, modeling_params, output_df,
                                    recipe_desc, y, transformed["UNPROCESSED"], sample_weight)

    # write metrics dataset
    if metrics_dataset_smartname:
        metrics_dataset = dataiku.Dataset(metrics_dataset_smartname)
        #logging.info("writing metrics schema")
        #metrics_dataset.write_schema_from_dataframe(metrics_df) # backend should maybe do this ?
        logging.info("writing metrics data")
        metrics_dataset.write_from_dataframe(metrics_df)
# NOTE: this script targets Python 2 (`urllib.urlopen`, `xrange`)
def scrape(page_num):
    """Fetch one listing page and add its results to the module-level RESULTS list.

    The results are read as JSON from the `data-results` attribute of the form found
    in the element with the id `places`.

    Args:
        page_num: Number of the page, appended to the LISTING url
    """
    url = LISTING + str(page_num).strip()
    # load page content
    page = urllib.urlopen(url)
    try:
        # find the interesting data with html
        soup = BeautifulSoup(page)
    finally:
        # The page is fully parsed at this point: release the connection
        page.close()
    result = soup.find('div', {'id': 'places'}).find('form')['data-results']
    listing = json.loads(result)
    # keep the data in RESULTS
    RESULTS.extend(listing['results'])


for p in xrange(1, NPAGES + 1):
    print("Crawling page %s" % p)
    scrape(p)
print("Crawled %i places" % len(RESULTS))

# Write in a dataset
site_data = Dataset("__FIRST_OUTPUT__")
schema = [{'name': 'key', 'type': 'int'}, {'name': 'data', 'type': 'string'}]
site_data.write_schema(schema)
# The writer must be closed for the rows to be flushed: the `with` block guarantees it
with site_data.get_writer() as writer:
    for i, e in enumerate(RESULTS):
        writer.write_tuple([i, json.dumps(e)])
def output_generator():
    """Yield the input dataset by chunks, each with the prediction columns of the model.

    Relies on names from the enclosing scope (`input_dataset`, `pipeline`, `clf`, `core_params`,
    `recipe_desc`, `preprocessing_params`, `modeling_params`, `batch_size`, ...).

    Yields:
        One pandas.DataFrame per chunk of `batch_size` rows: the kept input columns that are not
        prediction columns, followed by the prediction columns

    Raises:
        ValueError: If the prediction type is not binary classification, multiclass or regression
    """
    logging.info("Start output generator ...")

    (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
        preparation_output_schema["columns"], parse_dates=True, infer_with_pandas=False)
    logging.info("Reading with INITIAL dtypes: %s" % dtypes)
    # The dtypes derived from the feature parameters replace the ones read from the schema
    dtypes = utils.ml_dtypes_from_dss_schema(preparation_output_schema,
                                             preprocessing_params["per_feature"],
                                             prediction_type=core_params["prediction_type"])
    logging.info("Reading with dtypes: %s" % dtypes)

    for column_index, column_name in enumerate(names):
        logging.info("Column %s = %s (dtype=%s)" % (column_index, column_name, dtypes.get(column_name, None)))

    chunks = input_dataset.iter_dataframes_forced_types(
        names, dtypes, parse_date_columns, chunksize=batch_size, float_precision="round_trip")
    for input_df in chunks:
        # Reset the index to 0..n-1 for each chunk
        input_df.index = range(input_df.shape[0])
        # Keep an untouched copy: `input_df` is normalized in place below
        input_df_orig = input_df.copy()
        logging.info("Got a dataframe : %s" % str(input_df.shape))

        normalize_dataframe(input_df, preprocessing_params['per_feature'])
        for col in input_df:
            logging.info("NORMALIZED: %s -> %s" % (col, input_df[col].dtype))

        logging.info("Processing it")
        logging.info("Predicting it")

        prediction_type = core_params["prediction_type"]
        if prediction_type == constants.BINARY_CLASSIFICATION:
            pred_df = binary_classification_predict(clf,
                                                    pipeline,
                                                    modeling_params,
                                                    preprocessing_params,
                                                    preprocessing_handler.target_map,
                                                    recipe_desc["forcedClassifierThreshold"],
                                                    input_df,
                                                    output_probas=recipe_desc["outputProbabilities"])
            # Probability percentile & Conditional outputs
            pred_df = binary_classif_scoring_add_percentile_and_cond_outputs(
                pred_df, recipe_desc, model_folder, cond_outputs, preprocessing_handler.target_map)
        elif prediction_type == constants.MULTICLASS:
            pred_df = multiclass_predict(clf,
                                         pipeline,
                                         modeling_params,
                                         preprocessing_params,
                                         preprocessing_handler.target_map,
                                         input_df,
                                         output_probas=recipe_desc["outputProbabilities"])
        elif prediction_type == constants.REGRESSION:
            pred_df = regression_predict(clf, pipeline, modeling_params, input_df)
        else:
            raise ValueError("bad prediction type %s" % core_params["prediction_type"])

        logging.info("pred df debug :")
        logging.info(pred_df)
        logging.info("Done predicting it")

        # Input columns that have the same name as a prediction column are not kept
        if recipe_desc.get("filterInputColumns", False):
            candidate_columns = recipe_desc["keptInputColumns"]
        else:
            candidate_columns = input_df_orig.columns
        clean_kept_columns = [c for c in candidate_columns if c not in pred_df.columns]

        yield pd.concat([input_df_orig[clean_kept_columns], pred_df], axis=1)