예제 #1
0
def test_dataset_to_json():
    """Every registered dataset's payload must serialize to valid JSON."""
    files = DatasetManager.list_datasets()
    for file in files:
        # renamed from `id` to avoid shadowing the builtin
        dataset_id = file['id']
        dataset = DatasetManager.get_dataset(dataset_id)
        assert dataset is not None
        payload = json.dumps(dataset.get_payload())
        # (removed dead commented-out file-dump code that was left in the
        # body as a bare triple-quoted string statement)
        assert is_json(payload)
예제 #2
0
def test_query2dataset():
    """Materializing a SQL query result as a new dataset must not raise."""
    query_spec = {
        "source_dataset_id": "iris",
        "query_type": "sql",
        "query": "SELECT * FROM dataset LIMIT 20;",
        "dataset_id": "query2dataset",
        "dataset_name": "query2dataset",
        "dataset_description": "test query to dataset",
    }

    try:
        DatasetManager.query2dataset(**query_spec)
        # clean up the dataset created by the call above
        DatasetManager.delete_dataset(query_spec['dataset_id'])
    except Exception:
        pytest.fail('query to dataset should not raise error')
예제 #3
0
def test_adddataset():
    """Adding a base64-encoded CSV payload as a dataset should succeed."""
    encoded_data = base64.b64encode(b'A,B,C,D\n1,2,3,4')
    payload = {
        "id": "b64test",
        "name": "b64test",
        "payload": encoded_data,
        "description": "b64test dataset",
    }

    try:
        DatasetManager.add_dataset(payload)
        # clean up the dataset created above
        DatasetManager.delete_dataset(payload['id'])
    except Exception:
        # fixed typo in the failure message: 'execption' -> 'exception'
        pytest.fail('should not raise exception')
예제 #4
0
 def predict(job_id, payload):
     """Run a prediction with the trained model for *job_id*.

     payload: dict with 'data' and 'input_type' keys.
       - input_type 'csv': 'data' is base64-encoded CSV text; returns
         {'data': base64-encoded CSV of the prediction frame}.
       - input_type 'dataset': 'data' is a dataset id; returns a dict with
         'cols' and 'rows' built from the prediction frame.

     Raises RuntimeError for unsupported input types; any underlying
     error is logged and re-raised.
     """
     data = payload['data']
     input_type = payload['input_type']
     try:
         model = MLJob.get_model(job_id)
         if input_type == 'csv':
             csv_data = BytesIO(base64.b64decode(data))
             df = pd.read_csv(csv_data, sep=",")
             df_prediction = model.predict(df)
             output_data = df_prediction.to_csv(index=False)
             result = {}
             result['data'] = base64.b64encode(output_data.encode('utf-8'))
             return result
         elif input_type == 'dataset':
             dataset = DatasetManager.get_dataset(data)
             df = dataset.get_df()
             df_prediction = model.predict(df)
             # build a fresh dict instead of rebinding the `payload`
             # parameter, which made the function harder to follow
             result = {}
             result["cols"], result["rows"] = df_to_cols_rows(df_prediction)
             return result
         else:
             message = f'input type {input_type} is not supported for prediction'
             logger.error(message)
             raise RuntimeError(message)
     except Exception as e:
         logger.exception(
             f'failed to do prediction for data={data} id={job_id} error={e}'
         )
         # bare raise preserves the original traceback (raise e truncated it)
         raise
예제 #5
0
def test_job_auto_regression():
    """Train an auto-regression job on the housing dataset and predict."""
    dataset_id = 'housing'
    dataset = DatasetManager.get_dataset(dataset_id)
    assert dataset is not None
    df = dataset.get_df()
    assert df is not None

    features = [
        'crime_rate',
        'business_acres',
        'avg_rooms_per_dwelling',
        'distance_to_employment_center',
    ]
    targets = ['median_house_value']

    # keep the AutoML search short so the test stays fast
    job_option = {
        'time_left_for_this_task': 30,
        'per_run_time_limit': 10,
    }

    job = AutoRegressionJob('testregression', dataset_id, features, targets,
                            job_option, None)
    job.train()

    predict_result = job.predict(df[features])
    predict_result[targets] = df[targets]
    assert job.get_status() == MLJobStatus.SUCCESS
    job.clean()
예제 #6
0
def test_sqlquery():
    """A SQL query with LIMIT 10 must yield exactly ten rows."""
    iris = DatasetManager.get_dataset('iris')
    query_result = iris.query('SELECT * FROM dataset LIMIT 10;', 'sql')
    assert query_result is not None

    # columns are not inspected here; only the row count matters
    _, rows = df_to_cols_rows(query_result)
    assert rows is not None
    assert len(rows) == 10
예제 #7
0
 def __init__(self, name, dataset):
     """Create a job bound to an existing dataset.

     name: human-readable job name.
     dataset: a dataset id, resolved through DatasetManager.
     """
     # random unique id for this job instance
     self.id = str(uuid.uuid4())
     self.name = name
     self.dataset_id = dataset
     self.dataset = DatasetManager.get_dataset(dataset)
     # materialize the dataset as a dataframe for training/prediction
     self.df = self.dataset.get_df()
     # per-job working directory under the shared MLJob base dir
     self.job_dir = os.path.join(MLJob.base_dir, self.id)
     self.metadata = {}
     # subclass hook that finishes job-specific setup
     self._init()
def test_job_time_serials():
    """Train a time-series forecast job and verify prediction succeeds."""
    import os
    import tempfile

    dataset_id = 'air_passengers'
    dataset = DatasetManager.get_dataset(dataset_id)
    assert dataset is not None
    df = dataset.get_df()
    assert df is not None

    features = ['Date']
    targets = ['Number']

    job_option = {}

    job = TimeSerialsForecastsJob('testtimeserials', dataset_id, features,
                                  targets, job_option)
    job.train()
    if hasattr(job, 'training_error'):
        print(f'training error was detected {job.training_error}')

    assert job.get_status() == MLJobStatus.SUCCESS
    predict_result = job.predict(df[features])
    assert predict_result is not None
    # write the debug dump to a portable temp location instead of the
    # hard-coded /tmp/tt.csv (non-portable, litters the filesystem)
    out_path = os.path.join(tempfile.mkdtemp(), 'tt.csv')
    predict_result.to_csv(out_path, encoding='utf-8')
예제 #9
0
def test_job_auto_multi_classification():
    """Train a multi-class classification job on iris and predict."""
    dataset_id = 'iris'
    dataset = DatasetManager.get_dataset(dataset_id)
    assert dataset is not None
    df = dataset.get_df()
    assert df is not None

    features = ['sepal_length', 'sepal_width']
    targets = ['species']

    # keep the AutoML search short so the test stays fast
    job_option = {
        'time_left_for_this_task': 30,
        'per_run_time_limit': 10,
    }

    job = AutoClassificationJob('testclassification', dataset_id, features,
                                targets, job_option, None)
    job.train()

    predict_result = job.predict(df[features])
    predict_result[targets] = df[targets]
    assert job.get_status() == MLJobStatus.SUCCESS
    job.clean()
예제 #10
0
def test_job_auto_classification():
    """Train a binary classification job on the churn dataset and predict."""
    dataset_id = 'churn'
    dataset = DatasetManager.get_dataset(dataset_id)
    assert dataset is not None
    df = dataset.get_df()
    assert df is not None

    features = ['Account Length', 'Area Code', 'Day Calls', 'State']
    targets = ['Churn?']

    # keep the AutoML search short so the test stays fast
    job_option = {
        'time_left_for_this_task': 30,
        'per_run_time_limit': 10,
    }

    job = AutoClassificationJob('testclassification', dataset_id, features,
                                targets, job_option, None)
    job.train()

    predict_result = job.predict(df[features])
    predict_result[targets] = df[targets]
    assert job.get_status() == MLJobStatus.SUCCESS
    job.clean()
예제 #11
0
def test_query():
    """A pandas-style boolean filter query must return a result."""
    iris = DatasetManager.get_dataset('iris')
    result = iris.query('sepal_length > sepal_width')
    assert result is not None
예제 #12
0
def test_data_list():
    """The manager must report exactly the five bundled sample datasets."""
    datasets = DatasetManager.list_datasets()
    assert len(datasets) == 5
예제 #13
0
def test_data_list():
    """list_datasets must run without error and return a listing."""
    datasets = DatasetManager.list_datasets()
    # previously this test made no assertion at all, so it only verified
    # that the call did not raise; at minimum require a non-None result
    assert datasets is not None