示例#1
0
def load_data_problem(inputdir, problempath):
    """
    Load a D3M dataset together with its parsed problem description.

    Parameters
    ----------
    inputdir
        Root input directory (echoed for logging only).
    problempath
        Path to the problemDoc.json file.

    Returns
    -------
    tuple
        (dataset, taskname, problem_description, metric, posLabel, keywords)
    """
    print("Reading ", inputdir)
    print("Reading ", problempath)

    with open(problempath) as file:
        problem_schema = json.load(file)

    # Recover the dataset root by stripping a fixed-length problem-doc suffix.
    # NOTE(review): assumes the suffix is exactly 29 characters
    # (e.g. ".../problem_TRAIN/problemDoc.json") — confirm against callers.
    datasetId = problempath[:-29]
    dataset_schema = datasetId + "dataset_TRAIN/datasetDoc.json"
    problem_doc_metadata = Metadata(problem_schema)
    dataset_uri = 'file://{dataset_uri}'.format(dataset_uri=dataset_schema)
    dataset = D3MDatasetLoader().load(dataset_uri)

    problem_description = problem.parse_problem_description(problempath)
    dataset = add_target_columns_metadata(dataset, problem_description)
    dataset = add_privileged_columns_metadata(dataset, problem_description)
    taskname = get_task_name(problem_doc_metadata.query(())['about']['taskKeywords'])
    metric = problem_doc_metadata.query(())['inputs']['performanceMetrics'][0]['metric']
    posLabel = None
    if metric == "f1":
        # Binary F1 needs to know which label counts as the positive class.
        posLabel = problem_doc_metadata.query(())['inputs']['performanceMetrics'][0]['posLabel']

    # Read the data augmentation keywords, if any.
    keywords = getAugmentation_keywords(problem_doc_metadata)

    return (dataset, taskname, problem_description, metric, posLabel, keywords)
示例#2
0
def load_problem_doc(problem_doc_uri: str):
    """
    Load problem_doc from problem_doc_uri.

    Parameters
    ----------
    problem_doc_uri
        Uri where the problemDoc.json is located
    """
    with open(problem_doc_uri) as file:
        problem_doc = json.load(file)
    problem_doc_metadata = Metadata(problem_doc)
    return problem_doc_metadata
示例#3
0
def load_data_problem(inputdir, problempath):
    """
    Load the dataset referenced by a problemDoc.json.

    Returns a tuple of (dataset, taskname, problem_description).
    """
    print("Reading ", inputdir)
    print("Reading ", problempath)

    with open(problempath) as file:
        problem_schema = json.load(file)

    # Strip the problem-doc suffix to recover the dataset root directory.
    dataset_root = problempath[:-29]
    schema_path = dataset_root + "dataset_TRAIN/datasetDoc.json"
    doc_metadata = Metadata(problem_schema)
    uri = 'file://{dataset_uri}'.format(dataset_uri=schema_path)
    dataset = D3MDatasetLoader().load(uri)

    parsed_problem = problem.parse_problem_description(problempath)
    dataset = add_target_columns_metadata(dataset, parsed_problem)

    taskname = doc_metadata.query(())['about']['taskType']

    return (dataset, taskname, parsed_problem)
示例#4
0
def load_problem_doc(problem_doc_path: str) -> Metadata:
    """
    Load problem_doc from problem_doc_path

    Parameters
    ----------
    problem_doc_path
        Path where the problemDoc.json is located
    """

    with open(problem_doc_path) as file:
        problem_doc = json.load(file)
    return Metadata(problem_doc)
示例#5
0
def load_data(data_path, problem_path) -> tuple:
    '''
    Load the dataset and its problem metadata from the given paths.
    '''
    loader = D3MDatasetLoader()
    # Normalize a bare filesystem path into a file:// URI.
    if "file:" not in data_path:
        data_path = 'file://{dataset_path}'.format(
            dataset_path=os.path.abspath(data_path))
    with open(problem_path) as f:
        problem = Metadata(json.load(f))
    dataset = loader.load(dataset_uri=data_path)
    dataset = add_target_columns_metadata(dataset, problem)
    return dataset, problem
def convert_problem_doc_proto_to_metadata(problem_doc):
    """
    Convert a problem-doc protobuf message into a d3m Metadata object.

    Builds a plain-dict problemDoc (schema version 3.1.1) with the
    "about" and "inputs" sections populated from the proto, then wraps
    it in Metadata.

    Parameters
    ----------
    problem_doc
        Protobuf message with `.problem`, `.inputs` and nested target fields.

    Returns
    -------
    Metadata
        Metadata built from the reconstructed JSON problem doc.
    """
    json_problem_doc = {}

    # General info
    json_problem_doc["about"] = {
        "problemID": problem_doc.problem.id,
        "problemVersion": problem_doc.problem.version,
        "problemName": problem_doc.problem.name,
        "taskType": problem_doc.problem.task_type,
        "taskSubType": problem_doc.problem.task_subtype,
        "problemSchemaVersion": "3.1.1",
        "problemDescription": problem_doc.problem.description
    }

    # Set inputs: one entry per input dataset, each with its target columns.
    inputs_data = []
    for input_proto in problem_doc.inputs:
        targets_data = [
            {
                "targetIndex": target_data.target_index,
                "resID": target_data.resource_id,
                "colIndex": target_data.column_index,
                "colName": target_data.column_name,
            }
            for target_data in input_proto.targets
        ]
        inputs_data.append({
            "datasetID": input_proto.dataset_id,
            "targets": targets_data
        })

    # Set metrics: map proto enum values back to their schema metric names.
    # The reverse map is loop-invariant, so build it once (the original
    # rebuilt it per metric and left a debug print behind).
    reverse_map = {v: k for k, v in PerformanceMetric.get_map().items()}
    perf_metrics = [
        {"metric": reverse_map[PerformanceMetric(pm.metric)]}
        for pm in problem_doc.problem.performance_metrics
    ]
    json_problem_doc["inputs"] = {
        "data": inputs_data,
        "performanceMetrics": perf_metrics
    }

    # From json use standard methods to parse to metadata
    return Metadata(json_problem_doc)
示例#7
0
def load_test_dataset_for_pipeline(config_path) -> tuple:
    '''
    Load the test dataset and problem described by test_config.json
    found under config_path; returns (dataset, problem).
    '''
    config_file = os.path.join(config_path, "test_config.json")
    with open(config_file, "r") as cfg:
        test_config = json.load(cfg)
    data_path = test_config["dataset_schema"]
    problem_path = test_config["problem_schema"]

    # Normalize a bare filesystem path into a file:// URI.
    if "file:" not in data_path:
        data_path = 'file://{dataset_path}'.format(
            dataset_path=os.path.abspath(data_path))

    with open(problem_path) as fh:
        problem = Metadata(json.load(fh))

    dataset = D3MDatasetLoader().load(dataset_uri=data_path)
    dataset = add_target_columns_metadata(dataset, problem)
    return dataset, problem
示例#8
0
def metadata_to_str(metadata: base.Metadata,
                    selector: base.Selector = None) -> str:
    """Render metadata (optionally restricted to selector) as a string."""
    out = io.StringIO()
    metadata.pretty_print(selector, out)
    return out.getvalue()