Пример #1
0
    def get(self, model_id):
        """Return a single trained model as a HATEOAS-style dict.

        The response carries the serialised model row under 'content' and
        two links: 'self' (this resource) and 'file' (the pickled model
        binary in the storage service).
        """
        trained_model = TrainedModel.query.filter_by(id=model_id).first()
        content_dict = row_to_dict(trained_model)

        self_link = {
            "rel": "self",
            "href": url_join(base_url=request.base_url, url=model_id),
        }
        file_link = {
            "rel": "file",
            "href": url_join(
                current_app.config['STORAGE_API'],
                content_dict['file_id'],
            ),
        }

        return {'content': content_dict, 'links': [self_link, file_link]}
Пример #2
0
def dataset_get(dataset_id):
    """Render the info page for a single dataset.

    Fetches the dataset metadata from the datasets API, follows its
    'binary' HATEOAS link to download the CSV from storage, and renders
    a paginated preview. Redirects back to the list view with a flash
    message on any API failure or empty file.
    """
    # TODO: A lot of it should be replaced -> df = h.dataset_id_to_df(dataset_id)

    dataset_single_uri = h.url_join(current_app.config['DATASETS_API'],
                                    dataset_id)
    r = requests.get(dataset_single_uri)

    # requests already exposes status_code as an int; no cast needed.
    if r.status_code != 200:
        flash(
            'Dataset API response code was {}, '
            'cannot fetch the dataset'.format(r.status_code), 'warning')
        return redirect(url_for('site_datasets.dataset_list'))

    json_data = json.loads(r.content)

    storage_binary_uri = h.hateoas_get_link(json_data, 'binary')

    # TODO: replace with h.hateoas_get_link(json_data, 'label') once the
    # dataset API exposes a 'label' link.
    label_url = h.url_join(request.url, 'al')

    # TODO: get storage delete uri from dataset
    delete_url = url_for('site_datasets.dataset_delete', dataset_id=dataset_id)

    r_storage = requests.get(storage_binary_uri)
    if r_storage.status_code != 200:
        # BUG FIX: report the storage response code (r_storage), not the
        # already-successful dataset API code (r).
        flash(
            'Storage API response code was {}, cannot fetch the file'.format(
                r_storage.status_code), 'warning')
        return redirect(url_for('site_datasets.dataset_list'))

    csv_content = r_storage.content

    try:
        df = pd.read_csv(io.StringIO(csv_content.decode('utf-8')))
    except EmptyDataError:
        flash('Could not make a dataset out of the storage file - it is empty',
              'warning')
        return redirect(url_for('site_datasets.dataset_list'))

    if df.empty:
        flash('Cannot show dataset - it is empty', 'warning')

    html_params = h.get_html_pagination_params(request.args, df)

    return render_template(
        'datasets/info.html',
        info=json_data,
        data=html_params['page_data'],
        pagination=html_params['pagination'],
        download_url=storage_binary_uri,
        label_url=label_url,
        delete_url=delete_url,
    )
Пример #3
0
    def get(self):
        """List datasets, optionally filtered by user and/or project.

        Query parameters:
            userid: restrict results to one user's datasets.
            projectid: restrict results to one project's datasets.

        Returns a HATEOAS-style dict; each dataset carries 'self' and
        'file' links (the latter pointing at the storage API).
        """
        storage_api_url = current_app.config['STORAGE_API']

        user_id = request.args.get('userid')
        project_id = request.args.get('projectid')

        query = Dataset.query
        if user_id:
            query = query.filter_by(user_id=user_id)
        if project_id:
            query = query.filter_by(project_id=project_id)
        # TODO: add admin and None if not logged in - security risk!
        datasets = query.all()

        content = [{
            'id': d.id,
            'file_id': d.file_id,
            'name': d.name,
            'description': d.description,
            'train': d.train,
            'test': d.test,
            'label': d.label,
            'user_id': d.user_id,
            'project_id': d.project_id,
            'links': [{
                "rel": "self",
                "href": url_join(request.base_url, d.id)
            }, {
                "rel": "file",
                "href": url_join(storage_api_url, d.file_id)
            }]
        } for d in datasets]

        # BUG FIX: the self link previously used '?user_id=' although this
        # handler reads 'userid', and collapsed to '' when no filter was
        # given.  Echo back the parameter name actually accepted above and
        # always include the base URL.
        self_href = request.base_url
        if user_id:
            self_href += '?userid={}'.format(user_id)

        return {
            'content': content,
            'links': [{
                "rel": "self",
                "href": self_href
            }]
        }
Пример #4
0
 def get(self, dataset_id):
     """Return one dataset as JSON with HATEOAS 'self' and 'file' links."""
     storage_api = current_app.config['STORAGE_API']

     # Fetch and serialise the requested dataset row.
     record = Dataset.query.filter_by(id=dataset_id).first()
     payload = row_to_dict(record)

     links = [
         {"rel": "self", "href": url_join(request.base_url, dataset_id)},
         {"rel": "file", "href": url_join(storage_api, payload['file_id'])},
     ]
     return jsonify({'content': payload, 'links': links})
Пример #5
0
    def post(self, model_id):
        """Predict with a stored model on a posted CSV body.

        The request body is a UTF-8 CSV; the first column is used as the
        model input. Returns a HATEOAS-style dict with the inputs ('X')
        and predictions ('y') under 'content'.
        """
        posted_file = request.get_data('content')
        csv = StringIO(posted_file.decode('utf-8'))
        df = pd.read_csv(csv, encoding='utf-8')

        tm = TrainedModel.query.filter_by(id=model_id).first()
        storage_api = current_app.config['STORAGE_API']
        r = requests.get('{}/{}'.format(storage_api, tm.file_id))
        pickled_model = r.content

        # SECURITY NOTE: pickle.loads on bytes fetched over HTTP executes
        # arbitrary code if the storage service is compromised — consider a
        # safer serialisation format.
        trained_model = pickle.loads(pickled_model)

        # BUG FIX: DataFrame.ix was removed in pandas 1.0; use positional
        # .iloc for "first column" instead.
        y = trained_model.predict(df.iloc[:, 0])

        result = {'X': list(df.iloc[:, 0].values), 'y': y.tolist()}

        self_href = url_join(
            base_url=request.base_url,
            url=url_for(
                'machine_learning_models.single',
                model_id=model_id
            )
        )
        return {
            'content': result,
            'links': [
                {
                    "rel": "self",
                    "href": self_href
                }
            ]
        }
Пример #6
0
    def get(self, dataset_id):
        """Select the next batch of rows for active-learning labeling.

        Downloads the dataset CSV from storage, marks a sample of rows
        (via a 'batch' column) for labeling, writes the result back to
        storage with a PUT, and returns the resulting frame as JSON with
        a HATEOAS 'self' link.
        """
        # Get initial dataset
        dataset_object = Dataset.query.filter_by(id=dataset_id).first()

        input_field = dataset_object.input
        target_field = dataset_object.target
        # label_batch_field = dataset_object.label_batch # TODO: Implement label batch field
        batch_field_name = 'batch'

        # Fetch the raw CSV from storage and load it into a DataFrame.
        # NOTE(review): assumes UTF-8 text content — confirm against storage.
        storage_uri = dataset_object.binary_uri
        r_storage = requests.get(storage_uri)
        dataset = r_storage.content.decode('utf-8')
        csv = StringIO(dataset)
        df = pd.read_csv(csv)

        # Batch size: 10% of the rows, capped at 100.
        sample_size = min(len(df) // 10, 100)

        if batch_field_name not in df.columns:
            # First pass: no batch column yet — build the initial label set
            # by clustering on the input field.
            sample_size = min(sample_size, len(df))
            result_df = get_initial_label_set(
                df,
                input_field,
                n_clusters=sample_size,
                result_save_path=
                None,  # TODO: Needs to be temp and removed instantly
                batch_field=batch_field_name)
        else:
            # Rows with no label at all vs. rows already marked for labeling
            # (batch != 0) that are still missing a label.
            unlabeled_rows = df[(df[target_field].apply(pd.isnull))]
            marked_unlabeled_rows = df[(df[batch_field_name] != 0)
                                       & ((df[target_field] == '') |
                                          (df[target_field].apply(pd.isnull)))]
            total_marked_unlabeled = len(marked_unlabeled_rows)

            # Don't mark a new batch while the previous one is unfinished,
            # or when everything is already labeled.
            marked_labels_missing = total_marked_unlabeled != 0
            all_labeled = len(unlabeled_rows) == 0
            if marked_labels_missing or all_labeled:
                result_df = df
            else:
                result_df = mark_for_labeling(df, batch_field_name,
                                              sample_size)

        # Persist the updated frame back to storage.
        # NOTE(review): verify=False disables TLS certificate checks — confirm
        # this is intentional for the storage adapter.
        result_csv = result_df.to_csv(index=False,
                                      encoding='utf-8').encode('utf-8')
        r = requests.put(dataset_object.storage_adapter_uri,
                         data=result_csv,
                         verify=False
                         # headers={'Content-Type': 'application/octet-stream'}
                         )

        self_href = url_join(request.base_url, dataset_id)

        return {
            'content': result_df.to_json(),
            'links': [{
                "rel": "self",
                "href": self_href
            }]
        }
Пример #7
0
    def get(self):
        """List trained models, optionally filtered by user and project.

        Query parameters 'userid' and 'projectid' narrow the result set.
        Each model dict is augmented with 'self' and 'file' links.
        """
        user_id = request.args.get('userid')
        project_id = request.args.get('projectid')

        # Apply the optional filters before materialising the query.
        query = TrainedModel.query
        if user_id:
            query = query.filter_by(user_id=user_id)
        if project_id:
            query = query.filter_by(project_id=project_id)

        trained_models_dict = [row_to_dict(model) for model in query.all()]

        storage_api = current_app.config['STORAGE_API']
        for model_dict in trained_models_dict:
            self_link = {
                "rel": "self",
                "href": url_join(base_url=request.base_url,
                                 url=model_dict['id']),
            }
            file_link = {
                "rel": "file",
                "href": url_join(base_url=storage_api,
                                 url=model_dict['file_id']),
            }
            model_dict['links'] = [self_link, file_link]

        return {
            'content': trained_models_dict,
            'links': [{
                "rel": "self",
                "href": request.base_url
            }],
        }
Пример #8
0
    def get(self, dataset_id):
        """Return a single dataset's fields plus its HATEOAS links.

        Links: 'self', 'storage' (storage adapter endpoint), 'label'
        (active-learning API root) and 'binary' (raw file).
        """
        # storage_api_url = current_app.config['STORAGE_API']

        dataset = Dataset.query.filter_by(id=dataset_id).first()
        # CLEANUP: a row_to_dict(dataset) call was removed here — its result
        # was never used.  TODO: get uri from DB, not id

        links = [{
            "rel": "self",
            "href": url_join(request.base_url, dataset.id)
        }, {
            "rel": "storage",
            "href": dataset.storage_adapter_uri
        }, {
            "rel": "label",
            "href": url_join(request.url_root, 'api', 'v1', 'active-learning')
        }, {
            "rel": "binary",
            "href": get_binary_uri(dataset)
        }]

        return {
            'id': dataset.id,
            'name': dataset.name,
            'description': dataset.description,
            'do_train': dataset.train,
            'do_test': dataset.test,
            'do_label': dataset.label,
            'user_id': dataset.user_id,
            'project_id': dataset.project_id,
            'links': links,
        }
Пример #9
0
def dataset_id_to_df(dataset_id):
    """Load a dataset's file from storage into a pandas DataFrame.

    Resolves the dataset via the datasets API, then follows its HATEOAS
    'binary' link to fetch the raw file.
    """
    dataset_uri = h.url_join(current_app.config['DATASETS_API'], dataset_id)
    dataset_info = requests.get(dataset_uri).json()

    # Follow the 'binary' link to the raw file in storage.
    binary_uri = h.hateoas_get_link(dataset_info, 'binary')
    raw_bytes = requests.get(binary_uri).content

    # TODO: This is an implementation for text based problems only
    return pd.read_csv(StringIO(raw_bytes.decode('utf-8')))
Пример #10
0
    def get(self, model_id):
        """Return a specific trained model with a HATEOAS 'self' link."""
        model_row = TrainedModel.query.filter_by(id=model_id).first()
        content_dict = row_to_dict(model_row)

        links = [{
            "rel": "self",
            "href": h.url_join(request.base_url, model_id),
        }]
        return {'content': content_dict, 'links': links}
Пример #11
0
def dataset_delete(dataset_id):
    """Delete a dataset record and its backing file, then redirect back."""
    dataset_url = url_join(current_app.config['DATASETS_API'], dataset_id)
    r = requests.get(dataset_url)

    # Resolve the file location from the dataset's links before deleting.
    json_data = json.loads(r.content)
    file_url = _get_link(json_data['links'], 'file')

    # Responses are currently ignored — see the flash below.
    r_dataset = requests.delete(dataset_url)
    r_file = requests.delete(file_url)

    flash('Not yet implemented', 'info')

    return redirect(url_for('.dataset_get', dataset_id=dataset_id))
Пример #12
0
def get_available_fields(dataset_id):
    """Return the dataset's column names as a JSON {name: name} mapping."""
    dataset_url = h.url_join(current_app.config['DATASETS_API'], dataset_id)
    response = requests.get(dataset_url)
    content = json.loads(response.content)

    # Pick the first 'binary' link, if any, from the dataset's links.
    file_url = next(
        (link['href'] for link in content['links']
         if link['rel'] == 'binary'),
        None)

    df = h.url_csv_to_df(file_url)
    return jsonify({column: column for column in df.columns})
Пример #13
0
def dataset_delete(dataset_id):
    """Delete a dataset record and its storage object, then redirect back."""
    dataset_url = h.url_join(current_app.config['DATASETS_API'], dataset_id)
    json_data = json.loads(requests.get(dataset_url).content)

    # Resolve the storage endpoint via the dataset's HATEOAS links.
    storage_url = h.hateoas_get_link(json_data, 'storage')
    # file_url = _get_link(json_data['links'], 'file')  # TODO: get storage delete uri from dataset

    # Responses are currently ignored — see the flash below.
    r_dataset = requests.delete(dataset_url)
    r_file = requests.delete(storage_url)

    flash('Not yet implemented', 'info')

    return redirect(url_for('.dataset_get', dataset_id=dataset_id))
Пример #14
0
def model(model_id):
    """Render a minimal HTML snippet describing one model.

    Fetches the model from the models API and links to its stored file.
    """
    url = h.url_join(current_app.config['MODELS_API'], str(model_id))
    r = requests.get(url)

    json_data = json.loads(r.content)
    url = h.hateoas_get_link(json_data, 'file')  # TODO: fix this

    # BUG FIX: '</br>' is not a valid HTML tag; use the void tag '<br/>'.
    return '''
    <p>
    Id: {}<br/>
    Name: {}<br/>
    Description:<br/>
    {}<br/>
    <a href={}>link</a></p>
    '''.format(json_data['content']['id'], json_data['content']['name'],
               json_data['content']['description'], url)
Пример #15
0
def model(model_id):
    """Render a minimal HTML snippet describing one model.

    Fetches the model from the models API and links to its file via the
    model's HATEOAS 'file' link.
    """
    url = url_join(current_app.config['MODELS_API'], str(model_id))
    r = requests.get(url)

    json_data = json.loads(r.content)
    url = _get_link(json_data['links'], 'file')
    # url = api_url + '/api/v1/storage/{}'.format(json_data['file_id'])

    # BUG FIX: '</br>' is not a valid HTML tag; use the void tag '<br/>'.
    return '''
    <p>
    Id: {}<br/>
    Name: {}<br/>
    Description:<br/>
    {}<br/>
    <a href={}>link</a></p>
    '''.format(json_data['content']['id'], json_data['content']['name'],
               json_data['content']['description'], url)
Пример #16
0
    def post(self, model_id):
        """
        Predict using trained model with selected data and fields
        :param model_id: ID of the model used for prediction
        :return: HATEOAS-like dict, where content is the df as JSON
        """

        posted_json = request.get_json()
        df = dataset_id_to_df(posted_json['dataset_id'])
        x = df.get(posted_json['input_field'][0])

        trained_model = TrainedModel.query.filter_by(id=model_id).first()
        algorithm = algorithm_dict[trained_model.algorithm_id](
            storage_adapter_api=current_app.config['STORAGE_ADAPTER_API'])
        algorithm.resources = posted_json['resources']
        algorithm.load()
        result = algorithm.predict(x)

        for idx, class_col in enumerate(result.T):
            # BUG FIX: the previous ternary bound to the whole concatenation,
            # so single-column output produced a column named '' instead of
            # 'predicted'.  Only the index suffix is conditional.
            suffix = '_{}'.format(idx) if result.shape[1] > 1 else ''
            predicted_column_name = 'predicted' + suffix

            # If the column name already exists, append a unique ID.
            if predicted_column_name in df.keys():
                predicted_column_name += str(uuid4())
            df[predicted_column_name] = class_col

        self_href = h.url_join(request.base_url,
                               'machine_learning_models.single', model_id)

        return {
            'content': df.to_json(),
            'links': [{
                "rel": "self",
                "href": self_href
            }]
        }
Пример #17
0
def post_local(uri, bin_data):
    """POST binary data to *uri* and return the URI of the stored object."""
    response = requests.post(uri, bin_data)
    # The storage service answers with the new object's id as JSON.
    return h.url_join(uri, response.json())
Пример #18
0
def predict():
    """Run a prediction with a user-selected model and dataset.

    Builds the model/dataset choice form, posts the chosen input column
    to the model's file endpoint, merges the predictions back into the
    dataset, and streams the result as an Excel or CSV download.
    """
    models_api_url = current_app.config['MODELS_API']

    # Only offer models belonging to the current user.
    params = {'userid': current_user.id}
    r = requests.get(models_api_url, params=params)

    models_response = r.json()

    # Map model id -> its 'self' link so we can POST data to it later.
    model_file_url_dict = dict()
    model_choices = []
    for model_dict in models_response['content']:
        model_id = str(model_dict['id'])
        choice = (model_id, model_dict['name'])
        model_choices.append(choice)
        model_file_url_dict[model_id] = [
            l['href'] for l in model_dict['links'] if l['rel'] == 'self'
        ][0]

    form = PredictForm()

    form.model.choices = model_choices

    dataset_choices = get_user_datasets_choices(current_user.id)
    form.dataset.choices = dataset_choices

    if not dataset_choices or not model_choices:
        if not dataset_choices:
            flash('Upload some datasets first', 'warning')
        if not model_choices:
            flash('No models - try training some first', 'warning')
        return redirect(url_for('site_machine_learning.root'))

    # The field list depends on the selected dataset (first one by default).
    if form.is_submitted():
        selected_dataset_value = form.dataset.data
    else:
        selected_dataset_value = dataset_choices[0][0]

    fields_dict = get_available_fields(selected_dataset_value).json.items()
    form.input.choices = fields_dict

    if form.validate_on_submit():
        # get dataset
        datasets_api_url = current_app.config['DATASETS_API']
        r = requests.get(url_join(datasets_api_url, selected_dataset_value))
        dataset_info = r.json()

        dataset_file_url = [
            l['href'] for l in dataset_info['links'] if l['rel'] == 'file'
        ][0]

        df = url_csv_to_df(dataset_file_url)
        input_df = df[[form.input.data]]

        # run model
        model_file_url = model_file_url_dict[form.model.data]
        r = requests.post(
            model_file_url,
            data=input_df.to_csv(index=False,
                                 encoding='utf-8').encode('utf-8'),
            headers={'Content-type': 'text/plain; charset=utf-8'})

        if r.status_code not in (200, 201):
            flash(
                'Prediction failed - model API call '
                'returned code: {}'.format(r.status_code), 'warning')
            return redirect(url_for('site_machine_learning.predict'))

        result = r.json()['content']
        result_df = DataFrame({'X': result['X'], 'y': result['y']})

        predicted_column_name = 'predicted'
        if predicted_column_name in df.keys():
            # BUG FIX: uuid4() returns a UUID object; concatenating it to a
            # str raised TypeError.  Convert explicitly.
            predicted_column_name += str(uuid4())
        df[predicted_column_name] = result_df['y']

        export_type = form.type.data
        if export_type == 'excel':
            output = io.BytesIO()
            writer = pd.ExcelWriter(output, engine='xlsxwriter')
            df.to_excel(writer, sheet_name='Sheet1', encoding='utf-8')
            # NOTE(review): ExcelWriter.save() is deprecated in newer pandas
            # (use close()) — confirm the pinned pandas version before changing.
            writer.save()
            result = output.getvalue()
            mimetype = "application/vnd.ms-excel"
            filename = 'result.xlsx'
        elif export_type == 'csv':
            result = df.to_csv(encoding='utf-8')
            mimetype = "text/csv"
            filename = 'result.csv'
        else:
            flash('Nonexistent export type', 'danger')
            return redirect(url_for('.predict'))

        return Response(result,
                        mimetype=mimetype,
                        headers={
                            "Content-disposition":
                            "attachment; filename={}".format(filename)
                        })

    return render_template('ml/predict_form.html',
                           form=form,
                           url=url_for('site_machine_learning.predict'))
Пример #19
0
def mark_for_labeling(dataset_id):
    """Ask the datasets API to mark the next active-learning batch.

    Hits the dataset's '/al' sub-resource and returns the raw response.
    """
    labeling_uri = h.url_join(current_app.config['DATASETS_API'],
                              dataset_id, 'al')
    return requests.get(labeling_uri)
Пример #20
0
def predict():
    """Run a prediction with a user-selected model and dataset.

    Builds the model/dataset choice form, posts the prediction request
    (dataset id, input field, model resources) to the models API, and
    streams the returned frame as an Excel or CSV download.
    """
    models_api_url = current_app.config['MODELS_API']

    # Only offer models belonging to the current user.
    params = {'userid': current_user.id}
    r = requests.get(models_api_url, params=params)

    models_response = r.json()

    # Map model id -> its 'resources' payload, forwarded to the API below.
    model_resources_dict = dict()
    model_choices = []
    for model_dict in models_response['content']:
        model_id = str(model_dict['id'])
        choice = (model_id, model_dict['name'])
        model_choices.append(choice)
        # TODO: rename to resources or so
        model_resources_dict[model_id] = model_dict['resources']

    form = PredictForm()

    form.model.choices = model_choices

    dataset_choices = get_user_datasets_choices(current_user.id)
    form.dataset.choices = dataset_choices

    if not dataset_choices or not model_choices:
        if not dataset_choices:
            flash('Upload some datasets first', 'warning')
        if not model_choices:
            flash('No models - try training some first', 'warning')
        return redirect(url_for('site_machine_learning.root'))

    # The input-field choices depend on the selected dataset (defaults to
    # the first one before submission).
    if form.is_submitted():
        selected_dataset_value = form.dataset.data
    else:
        selected_dataset_value = dataset_choices[0][0]

    fields_dict = get_available_fields(selected_dataset_value).json.items()
    form.input.choices = fields_dict

    if form.validate_on_submit():
        post_json = {
            'dataset_id': form.dataset.data,
            'input_field': form.input.data,
            'user_id': current_user.id,
            'project_id': '',
            'resources': model_resources_dict[form.model.data]
        }

        # The models API performs the prediction and returns the frame as
        # JSON under 'content'.
        r = requests.post(h.url_join(models_api_url, form.model.data),
                          json=post_json)

        if r.status_code not in (200, 201):
            flash(
                'Prediction failed - model API call '
                'returned code: {}'.format(r.status_code), 'warning')
            return redirect(url_for('site_machine_learning.predict'))

        result = r.json()['content']
        df = pd.read_json(result)

        # Serialise for download in the requested export format.
        export_type = form.type.data
        if export_type == 'excel':
            output = io.BytesIO()
            writer = pd.ExcelWriter(output, engine='xlsxwriter')
            df.to_excel(writer, sheet_name='Sheet1', encoding='utf-8')
            # NOTE(review): ExcelWriter.save() is deprecated in newer pandas
            # (use close()) — confirm the pinned pandas version.
            writer.save()
            result = output.getvalue()
            mimetype = "application/vnd.ms-excel"
            filename = 'result.xlsx'
        elif export_type == 'csv':
            result = df.to_csv(encoding='utf-8')
            mimetype = "text/csv"
            filename = 'result.csv'
        else:
            flash('Nonexistent export type', 'danger')
            return redirect(url_for('.predict'))

        return Response(result,
                        mimetype=mimetype,
                        headers={
                            "Content-disposition":
                            "attachment; filename={}".format(filename)
                        })

    return render_template('ml/predict_form.html',
                           form=form,
                           url=url_for('site_machine_learning.predict'))