Example #1
from d3m.metadata.pipeline import Pipeline


def load_pipeline(pipeline_path):
    # Parse a pipeline description from JSON or YAML, depending on the file extension.
    with open(pipeline_path) as pipeline_file:
        if pipeline_path.endswith('.json'):
            pipeline = Pipeline.from_json(pipeline_file)
        else:
            pipeline = Pipeline.from_yaml(pipeline_file)

    return pipeline
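
A minimal usage sketch; the file name my_pipeline.json is a hypothetical placeholder, not part of the original project:

pipeline = load_pipeline('my_pipeline.json')  # hypothetical path; a .yml file also works
print(pipeline.id, len(pipeline.steps))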
Example #2
def generate_pipeline(pipeline_path: str,
                      dataset_path: str,
                      problem_doc_path: str,
                      resolver: Resolver = None) -> Runtime:
    """
    Simplified interface that fit a pipeline with a dataset

    Paramters
    ---------
    pipeline_path
        Path to the pipeline description
    dataset_path:
        Path to the datasetDoc.json
    problem_doc_path:
        Path to the problemDoc.json
    resolver : Resolver
        Resolver to use.
    """

    # Pipeline description
    pipeline_description = None
    if pipeline_path.endswith('.json'):
        with open(pipeline_path) as pipeline_file:
            pipeline_description = Pipeline.from_json(
                string_or_file=pipeline_file, resolver=resolver)
    else:
        with open(pipeline_path) as pipeline_file:
            pipeline_description = Pipeline.from_yaml(
                string_or_file=pipeline_file, resolver=resolver)

    # Problem Doc
    problem_doc = load_problem_doc(problem_doc_path)

    # Dataset
    if not dataset_path.startswith('file:'):
        dataset_path = 'file://{dataset_path}'.format(
            dataset_path=os.path.abspath(dataset_path))

    dataset = D3MDatasetLoader().load(dataset_uri=dataset_path)
    # Adding Metadata to Dataset
    dataset = add_target_columns_metadata(dataset, problem_doc)

    # Pipeline
    pipeline_runtime = Runtime(pipeline_description)
    # Fitting Pipeline
    pipeline_runtime.fit(inputs=[dataset])
    return pipeline_runtime
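
A hedged usage sketch for generate_pipeline; the three paths below are hypothetical placeholders for a D3M-formatted dataset and problem, not taken from the original project:

# Placeholder paths pointing at a D3M dataset/problem layout.
fitted_runtime = generate_pipeline(
    pipeline_path='pipeline.yml',
    dataset_path='TRAIN/dataset_TRAIN/datasetDoc.json',
    problem_doc_path='TRAIN/problem_TRAIN/problemDoc.json',
)
# The returned Runtime is already fitted; predictions on new data can be
# produced with fitted_runtime.produce(inputs=[...]).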
Example #3
def load_pipeline(pipeline_path):
    with open(pipeline_path, 'r') as pipeline_file:
        if pipeline_path.endswith('.json'):
            return Pipeline.from_json(pipeline_file)
        else:
            return Pipeline.from_yaml(pipeline_file)
Example #4
    def test_convert_openml_task(self):
        self.maxDiff = None

        with open(
                os.path.join(PIPELINE_DIR,
                             'data-preparation-train-test-split.yml'),
                'r') as data_pipeline_file:
            data_pipeline = Pipeline.from_yaml(data_pipeline_file,
                                               resolver=Resolver())
        data_params = {
            'train_score_ratio': '0.8',
            'shuffle': 'true',
            'stratified': 'true',
        }
        task_id = 8
        save_dir = os.path.join(self.test_dir, 'single_dataset')
        save_dir_path = pathlib.PurePath(save_dir)

        datasets = {}
        crawler.crawl_openml_task(
            datasets=datasets,
            task_id=task_id,
            save_dir=save_dir,
            data_pipeline=data_pipeline,
            data_params=data_params,
            context=metadata_base.Context.TESTING,
        )
        self.assertEqual(
            datasets, {
                'openml_dataset_8':
                str(save_dir_path /
                    pathlib.PurePosixPath('openml_dataset_8/datasetDoc.json')),
                'openml_dataset_8_TRAIN':
                str(save_dir_path / pathlib.PurePosixPath(
                    'TRAIN/dataset_TRAIN/datasetDoc.json')),
                'openml_dataset_8_TEST':
                str(save_dir_path /
                    pathlib.PurePosixPath('TEST/dataset_TEST/datasetDoc.json')
                    ),
                'openml_dataset_8_SCORE':
                str(save_dir_path / pathlib.PurePosixPath(
                    'SCORE/dataset_SCORE/datasetDoc.json')),
            })

        self._assert_dir_structure(save_dir, [
            'SCORE',
            'TEST',
            'TRAIN',
            'openml_dataset_8',
            'openml_problem_8',
            'data_preparation_pipeline_run.pkl',
            'SCORE/dataset_SCORE',
            'SCORE/problem_SCORE',
            'SCORE/dataset_SCORE/tables',
            'SCORE/dataset_SCORE/datasetDoc.json',
            'SCORE/dataset_SCORE/tables/learningData.csv',
            'SCORE/problem_SCORE/problemDoc.json',
            'TEST/dataset_TEST',
            'TEST/problem_TEST',
            'TEST/dataset_TEST/tables',
            'TEST/dataset_TEST/datasetDoc.json',
            'TEST/dataset_TEST/tables/learningData.csv',
            'TEST/problem_TEST/problemDoc.json',
            'TRAIN/dataset_TRAIN',
            'TRAIN/problem_TRAIN',
            'TRAIN/dataset_TRAIN/tables',
            'TRAIN/dataset_TRAIN/datasetDoc.json',
            'TRAIN/dataset_TRAIN/tables/learningData.csv',
            'TRAIN/problem_TRAIN/problemDoc.json',
            'openml_dataset_8/tables',
            'openml_dataset_8/datasetDoc.json',
            'openml_dataset_8/tables/learningData.csv',
            'openml_problem_8/problemDoc.json',
        ])
Example #5
    def test_ignore_openml_task(self):
        self.maxDiff = None

        with open(
                os.path.join(PIPELINE_DIR,
                             'data-preparation-train-test-split.yml'),
                'r') as data_pipeline_file:
            data_pipeline = Pipeline.from_yaml(data_pipeline_file,
                                               resolver=Resolver())
        data_params = {
            'train_score_ratio': '0.8',
            'shuffle': 'true',
            'stratified': 'true',
        }
        save_dir = os.path.join(self.test_dir, 'ignore_dataset')
        max_tasks = 1
        has_errored = crawler.crawl_openml(
            save_dir=save_dir,
            task_types=(
                problem_module.OpenMLTaskType.SUPERVISED_CLASSIFICATION, ),
            data_pipeline=data_pipeline,
            data_params=data_params,
            context=metadata_base.Context.TESTING,
            max_tasks=max_tasks,
            ignore_tasks=[3],
            ignore_datasets=[2],
        )
        self.assertFalse(has_errored)

        self._assert_dir_structure(save_dir, [
            'openml_task_4',
            'openml_task_4/SCORE',
            'openml_task_4/TEST',
            'openml_task_4/TRAIN',
            'openml_task_4/openml_dataset_4',
            'openml_task_4/openml_problem_4',
            'openml_task_4/data_preparation_pipeline_run.pkl',
            'openml_task_4/SCORE/dataset_SCORE',
            'openml_task_4/SCORE/problem_SCORE',
            'openml_task_4/SCORE/dataset_SCORE/tables',
            'openml_task_4/SCORE/dataset_SCORE/datasetDoc.json',
            'openml_task_4/SCORE/dataset_SCORE/tables/learningData.csv',
            'openml_task_4/SCORE/problem_SCORE/problemDoc.json',
            'openml_task_4/TEST/dataset_TEST',
            'openml_task_4/TEST/problem_TEST',
            'openml_task_4/TEST/dataset_TEST/tables',
            'openml_task_4/TEST/dataset_TEST/datasetDoc.json',
            'openml_task_4/TEST/dataset_TEST/tables/learningData.csv',
            'openml_task_4/TEST/problem_TEST/problemDoc.json',
            'openml_task_4/TRAIN/dataset_TRAIN',
            'openml_task_4/TRAIN/problem_TRAIN',
            'openml_task_4/TRAIN/dataset_TRAIN/tables',
            'openml_task_4/TRAIN/dataset_TRAIN/datasetDoc.json',
            'openml_task_4/TRAIN/dataset_TRAIN/tables/learningData.csv',
            'openml_task_4/TRAIN/problem_TRAIN/problemDoc.json',
            'openml_task_4/openml_dataset_4/tables',
            'openml_task_4/openml_dataset_4/datasetDoc.json',
            'openml_task_4/openml_dataset_4/tables/learningData.csv',
            'openml_task_4/openml_problem_4/problemDoc.json',
        ])
Example #6
import logging
import pkg_resources
import random
import d3m.metadata.base
import d3m.runtime
from sqlalchemy.orm import joinedload
from d3m.container import Dataset
from d3m_ta2_nyu.workflow import database, convert
from d3m_ta2_nyu.utils import is_collection, get_dataset_sample
from d3m.metadata.pipeline import Pipeline
from d3m.metadata.problem import PerformanceMetric, TaskKeyword
from multiprocessing import Manager, Process

logger = logging.getLogger(__name__)

with pkg_resources.resource_stream(
        'd3m_ta2_nyu', '../resource/pipelines/kfold_tabular_split.yaml') as fp:
    kfold_tabular_split = Pipeline.from_yaml(fp)

with pkg_resources.resource_stream(
        'd3m_ta2_nyu',
        '../resource/pipelines/kfold_timeseries_split.yaml') as fp:
    kfold_timeseries_split = Pipeline.from_yaml(fp)

with pkg_resources.resource_stream(
        'd3m_ta2_nyu',
        '../resource/pipelines/train-test-tabular-split.yaml') as fp:
    train_test_tabular_split = Pipeline.from_yaml(fp)

with pkg_resources.resource_stream('d3m_ta2_nyu',
                                   '../resource/pipelines/scoring.yaml') as fp:
    scoring_pipeline = Pipeline.from_yaml(fp)
Example #7
File: schemas.py  Project: tods-doc/axolotl
def get_scoring_pipeline() -> Pipeline:
    with open(SCORING_PIPELINES_DIR, 'r') as pipeline_file:
        # Silence d3m log output while parsing the pipeline description.
        with d3m_utils.silence():
            pipeline = Pipeline.from_yaml(pipeline_file)
    return pipeline
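
A brief usage sketch, assuming the standard d3m Pipeline serialization API (to_json) is available:

scoring_pipeline = get_scoring_pipeline()
# Round-trip the loaded description back to JSON for inspection.
print(scoring_pipeline.to_json())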