def upload_dataset():
     if "file" in request.files:
         file = request.files["file"]
         if file.filename == "":
             abort(400)
         filename = secure_filename(file.filename)
         path = DATASETS_DIRECTORY / filename
         file.save(path)
         Dataset.create_from_path(path).save()
         return {"status": "ok"}, 201
     else:
         abort(400)
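# A hedged usage sketch for the upload handler above, using Flask's test client.
# It assumes `app` is the Flask application and that upload_dataset() is registered
# as a POST route at a hypothetical /datasets URL; the CSV content is illustrative.
import io

def example_upload(app):
    with app.test_client() as client:
        resp = client.post(
            "/datasets",
            data={"file": (io.BytesIO(b"col_a,col_b\n1,2\n"), "example.csv")},
            content_type="multipart/form-data",
        )
        assert resp.status_code == 201
        assert resp.get_json() == {"status": "ok"}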
 def export_confusion_matrix(id):
     model, _, _ = Dataset.model_from_id(id)
     if model.status != "done":
         return {"error": "Model is not trained"}, 409
     if not model.confusion_matrix_path:
         return {"error": "No confusion matrix available"}, 404
     return send_file(model.confusion_matrix_path, as_attachment=True)
 def delete_config(id):
     app.logger.info(f"Removing config {id}")
     config, dataset = Dataset.config_from_id(id)
     config.delete_data()
     dataset.configs = [c for c in dataset.configs if c.id != config.id]
     dataset.save()
     return jsonify({})
 def create_model(id):
     config, dataset = Dataset.config_from_id(id)
     app.logger.info(f"Creating model with config: {request.json}")
     model = DatasetModel(model_config=request.json)
     config.models.append(model)
     dataset.save()
     return model.to_json(), 201
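# Illustrative JSON body for create_model above. The exact keys honoured by
# model_config depend on what tpot_training() reads from it, so these TPOT-style
# parameters are assumptions rather than a confirmed schema.
example_model_config = {
    "generations": 5,
    "population_size": 20,
}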
 def delete_model(id):
     app.logger.info(f"Removing model {id}")
     model, config, dataset = Dataset.model_from_id(id)
     model.delete_data()
     config.models = [m for m in config.models if m.id != model.id]
     dataset.save()
     return jsonify({})
    def run(self):
        # Seed the database from the bundled CSV export.
        datasetFile = 'CSVdataset - Sheet1.csv'
        df = pd.read_csv(datasetFile)

        # Insert each row as a Dataset record (paragraph, intent, default context).
        for _, row in df.iterrows():
            dt = Dataset(row['paragraph'], row['intent'], 'nocontext')
            self.db.session.add(dt)

        # Persist the inserted rows.
        self.db.session.commit()
 def set_dataset_config(id):
     result = Dataset.from_id(id)
     columns = request.json.get("columns")
     label = request.json.get("label")
     model_type = request.json.get("model_type")
     config = DatasetConfig(columns=columns,
                            label=label,
                            model_type=model_type)
     result.configs.append(config)
     app.logger.info(f"Inserting config {request.json}")
     app.logger.info(result.configs)
     result.save()
     return config.to_json(), 201
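# Sketch of the JSON body set_dataset_config expects, inferred from the fields it
# reads above. Column names and values are illustrative assumptions; "classification"
# matches the model_type value checked elsewhere in this code.
example_config_payload = {
    "columns": {"age": True, "income": True, "target": True},
    "label": "target",
    "model_type": "classification",
}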
def load_all_datasets(datasets_directory):
    """Load all unknown datasets into the database """
    datasets_already_loaded = [Path(d.path) for d in Dataset.objects]

    for path in map(Path, os.listdir(datasets_directory)):
        path = (datasets_directory / path).resolve()
        if path.suffix == ".csv" and path not in datasets_already_loaded:
            log.info(f"Loading {path}")
            d = Dataset.create_from_path(path).save()
            log.info(f"Created entry for dataset {path}: {d.to_json()}")
        else:
            log.info(f"Not loading {path}")
def get_dataset_visualization(path: Path,
                              dataset: Dataset,
                              config: DatasetConfig = None):
    """Get or generate the SweetViz vizualisation"""
    if config is not None:
        viz_name = f"{path.name}-{config.id}-sweetviz.html"
    else:
        viz_name = f"{path.name}-sweetviz.html"
    viz_path = path.with_name(viz_name)
    log.info(f"Searching viz file {viz_path}")
    if viz_path.exists():
        log.info("Viz found")
    else:
        log.info("Generating viz")
        if config is not None:
            options = {
                "target_feat": config.label,
                "feat_cfg": sv.FeatureConfig(force_num=[config.label])
            }
            config.visualization_path = str(viz_path)
        else:
            options = {}
            dataset.visualization_path = str(viz_path)

        generate_visulisation(path, viz_path, options).compute()
        dataset.save()
        log.info("Returning viz")

    return viz_path
    def dataset_status(id):
        model: DatasetModel
        model, _, _ = Dataset.model_from_id(id)

        reply = {"status": model.status}

        if model.log_path:
            try:
                with open(model.log_path) as f:
                    reply["logs"] = f.read()
            except FileNotFoundError:
                pass

        return reply
def store():
    context = 'nocontext'
    intent = request.json['intent']
    paragraph = request.json['paragraph']

    if 'context' in request.json:
        context = request.json['context']

    dataset = Dataset(context=context, intent=intent, paragraph=paragraph)

    db.session.add(dataset)
    db.session.commit()

    data = singleTransform(dataset)

    return responses.created(data, 'Dataset successfully created')
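# Illustrative request body for store(); 'context' is optional and falls back to
# 'nocontext' as handled above. The field values here are made up.
example_store_body = {
    "intent": "greeting",
    "paragraph": "Hello, how can I help you today?",
    "context": "smalltalk",
}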
    def predict_result(id):
        dataset: Dataset
        config: DatasetConfig
        model, config, dataset = Dataset.model_from_id(id)

        # Check if model is trained
        if model.status != "done":
            return {"error": "Model is not trained"}, 409

        app.logger.info(f"predicting for dataset {dataset.name}")
        app.logger.info(f"Found configuration {config}")
        data = request.json
        app.logger.info(f"got data {data}")
        mapping = column_mapping.decode_mapping(dataset.column_mapping)
        for line in data:
            for k in line.keys():
                if k in mapping:
                    line[k] = mapping[k][line[k]]
                else:
                    line[k] = float(line[k])
        app.logger.info(f"Decoded data {data}")
        columns_order = [
            col for col in dataset.columns if col in config.columns
            and config.columns[col] and col != config.label
        ]
        app.logger.info(f"columns order {columns_order}")
        data = np.array([[line[col] for col in columns_order]
                         for line in data])
        app.logger.info(f"sorted data {data}")
        with open(model.pickled_model_path, "rb") as f:
            pipeline = pickle.load(f)
        app.logger.info("loaded pipeline")
        result = pipeline.predict(data).tolist()
        app.logger.info(f"Predicted {result}")

        if config.label in mapping:
            result = [
                column_mapping.reconvert_one_value(config.label, value,
                                                   mapping) for value in result
            ]
        return jsonify([{config.label: value} for value in result])
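# Sketch of the request body predict_result expects: a JSON list of rows keyed by
# column name. Values whose column appears in the dataset's column_mapping are decoded
# through that mapping; everything else is cast to float. The column names and values
# below are illustrative assumptions.
example_predict_body = [
    {"age": "42", "income": "52000", "city": "Paris"},
    {"age": "31", "income": "38000", "city": "Lyon"},
]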
    def train_model(id):
        model, config, dataset = Dataset.model_from_id(id)

        # Check if training is already done or in progress
        if model.status == "done":
            return {"error": "Model is already trained"}, 409
        if model.status not in ["not started", "error"]:
            return {"error": "Model is currently training"}, 409

        app.logger.info(f"Starting training dataset {dataset.name}")
        app.logger.info(f"config: {config.to_json()}")
        app.logger.info(f"model: {model.to_json()}")
        app.logger.info(f"Found configuration {config}")

        # update status
        model.status = "starting"
        dataset.save()

        fut = client.submit(training.train_model, id)
        fire_and_forget(fut)
        return {"status": model.status}, 202
 def lint_config_from_request(id):
     config = request.json
     dataset = Dataset.from_id(id)
     return lint_config(config, dataset)
def train_model(model_id):
    config: DatasetConfig
    model, config, dataset = Dataset.model_from_id(model_id)

    def set_status(status):
        logger.info(f"Setting status of {model.id} to: {status}")
        model.status = status
        dataset.save()

    try:
        # Create the different assets path
        dataset_path = Path(dataset.path)
        model_dir = dataset_path.parent / \
            f"{dataset.name}-model-{str(model.id)}"
        model_dir.mkdir(exist_ok=True)
        log_path = model_dir / "training.log"
        pickled_model_path = model_dir / "pipeline.pickle"
        exported_model_path = model_dir / "pipeline.py"
        shap_model_path = model_dir / "save.png"
        confusion_matrix_path = model_dir / "confusion_matrix.png"

        model.log_path = str(log_path)
        set_status("started")

        # Load the dataset
        mapping = column_mapping.decode_mapping(dataset.column_mapping)
        X, y = get_dataset(dataset_path, config, mapping)
        logger.info(f"Loaded dataset: {X} {y}")
        logger.info(f"Mapping: {mapping}")

        # Keep references to the original DataFrames (with column names);
        # SHAP and the confusion matrix need them after X/y are converted
        # to NumPy below.
        copy_X = X
        copy_y = y

        # Convert to types TPOT understands
        X = X.to_numpy().astype(np.float64)
        y = y.to_numpy().astype(np.float64)

        # Separate training and testing data. Using the same random_state keeps
        # the split of the original DataFrames (used for SHAP and the confusion
        # matrix) aligned with the numeric split used for training.
        split_seed = 42
        _, X_test_col, _, y_test_col = train_test_split(copy_X,
                                                        copy_y,
                                                        test_size=0.2,
                                                        random_state=split_seed)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=split_seed)

        logger.info(config.to_json())

        # Train the model
        classifier = tpot_training(X_train,
                                   y_train,
                                   model.model_config,
                                   log_file=log_path,
                                   model_type=config.model_type)

        # Save best pipeline
        save_res = save_pipeline(classifier, pickled_model_path)

        # Export best pipeline code
        export_res = export_pipeline_code(classifier, exported_model_path)

        # Save shap image
        image_res = save_shap(classifier, shap_model_path, copy_X, copy_y,
                              mapping)

        # Create metrics on the generated pipeline
        analysis_res = analyse_model(classifier, X_train, y_train, X_test,
                                     y_test)

        # Create the confusion matrix
        if config.model_type == "classification":
            matrix_res = create_confusion_matrix(classifier, X_test_col,
                                                 y_test_col,
                                                 confusion_matrix_path)
        else:
            matrix_res = dask.delayed(None)

        # Get the results of the exportation and model saving
        _, _, analysis, *_ = dask.compute(save_res, export_res, analysis_res,
                                          matrix_res, image_res)

        # Update the model with the exported paths
        # and set the status as done
        logger.info(f"Confusion matrix path: {confusion_matrix_path}; "
                    f"SHAP plot path: {shap_model_path}")
        model.pickled_model_path = str(pickled_model_path)
        model.exported_model_path = str(exported_model_path)
        if config.model_type == "classification":
            model.confusion_matrix_path = str(confusion_matrix_path)
        model.shap_model_path = str(shap_model_path)
        model.analysis = analysis
        model.status = "done"
        dataset.save()
    except Exception as e:
        logger.error(f"Got error while training: {e}")
        traceback.print_exc()
        set_status("error")
 def delete_dataset(id):
     dataset = Dataset.from_id(id)
     dataset.delete_data()
     dataset.delete()
     return {}
 def route_get_dataset(id):
     dataset = Dataset.from_id(id)
     return dataset.to_json()
 def get_dataset_config(id):
     config, _ = Dataset.config_from_id(id)
     return config.to_json()
 def get_dataset_visualization(id):
     d = Dataset.from_id(id)
     path = dataset.get_dataset_visualization(Path(d.path), d)
     return send_file(path)
 def lint_config_from_db(id):
     config, dataset = Dataset.config_from_id(id)
     return lint_config(config, dataset)
 def get_config_visualization(id):
     config, d = Dataset.config_from_id(id)
     path = dataset.get_dataset_visualization(Path(d.path), d, config)
     return send_file(path)
 def export_shap_value(id):
     model, _, _ = Dataset.model_from_id(id)
     if model.status != "done":
         return {"error": "Model is not trained"}, 409
     app.logger.info(f"ICI : {model.shap_model_path}\n\n\n\n")
     return send_file(model.shap_model_path, as_attachment=True)
 def get_model(id):
     model, config, dataset = Dataset.model_from_id(id)
     return model.to_json()
 def export_pickle(id):
     model, _, _ = Dataset.model_from_id(id)
     if model.status != "done":
         return {"error": "Model is not trained"}, 409
     return send_file(model.pickled_model_path, as_attachment=True)
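# Sketch of consuming the file returned by export_pickle: it is the pickled pipeline
# saved during training, so it can be loaded with pickle and used for predictions
# directly. The local file name and the feature values are assumptions; rows must
# follow the same column order used at training time (see predict_result above).
import pickle
import numpy as np

with open("pipeline.pickle", "rb") as f:
    pipeline = pickle.load(f)

X_new = np.array([[42.0, 52000.0]])  # illustrative feature values
print(pipeline.predict(X_new))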