def load_data_problem(inputdir, problempath):
    """Load a D3M training dataset and its problem description from disk.

    Parameters
    ----------
    inputdir
        Directory containing the input data (only logged here).
    problempath
        Path to the problemDoc.json file for the TRAIN split.

    Returns
    -------
    tuple
        (dataset, taskname, problem_description, metric, posLabel, keywords)
    """
    print("Reading ", inputdir)
    print("Reading ", problempath)

    with open(problempath) as file:
        problem_schema = json.load(file)

    # NOTE(review): assumes problempath always ends with a fixed 29-character
    # suffix (the "problem_TRAIN/problemDoc.json"-style tail) so that slicing
    # it off yields the dataset root directory — confirm for all split layouts.
    datasetId = problempath[:-29]
    dataset_schema = datasetId + "dataset_TRAIN/datasetDoc.json"
    problem_doc_metadata = Metadata(problem_schema)
    dataset_uri = 'file://{dataset_uri}'.format(dataset_uri=dataset_schema)
    dataset = D3MDatasetLoader().load(dataset_uri)

    problem_description = problem.parse_problem_description(problempath)
    dataset = add_target_columns_metadata(dataset, problem_description)
    dataset = add_privileged_columns_metadata(dataset, problem_description)

    taskname = get_task_name(problem_doc_metadata.query(())['about']['taskKeywords'])
    metric = problem_doc_metadata.query(())['inputs']['performanceMetrics'][0]['metric']
    # posLabel is only read for the "f1" metric; it stays None otherwise.
    posLabel = None
    if metric == "f1":
        posLabel = problem_doc_metadata.query(())['inputs']['performanceMetrics'][0]['posLabel']

    # Read the data augmentation keywords
    keywords = getAugmentation_keywords(problem_doc_metadata)

    return (dataset, taskname, problem_description, metric, posLabel, keywords)
def load_problem_doc(problem_doc_uri: str):
    """Read the problemDoc.json at *problem_doc_uri* and wrap it in Metadata.

    Parameters
    ----------
    problem_doc_uri
        Location of the problemDoc.json file.
    """
    with open(problem_doc_uri) as fp:
        parsed_doc = json.load(fp)
    return Metadata(parsed_doc)
def load_data_problem(inputdir, problempath):
    """Load the TRAIN dataset and problem description rooted at *problempath*.

    Returns a (dataset, taskname, problem_description) tuple.
    """
    print("Reading ", inputdir)
    print("Reading ", problempath)

    with open(problempath) as fp:
        problem_schema = json.load(fp)

    # Dropping the problem-doc tail of the path gives the dataset root.
    dataset_root = problempath[:-29]
    schema_path = dataset_root + "dataset_TRAIN/datasetDoc.json"
    problem_doc_metadata = Metadata(problem_schema)

    uri = 'file://{dataset_uri}'.format(dataset_uri=schema_path)
    loaded_dataset = D3MDatasetLoader().load(uri)

    problem_description = problem.parse_problem_description(problempath)
    loaded_dataset = add_target_columns_metadata(loaded_dataset, problem_description)

    taskname = problem_doc_metadata.query(())['about']['taskType']
    return (loaded_dataset, taskname, problem_description)
def load_problem_doc(problem_doc_path: str) -> Metadata:
    """
    Load the problemDoc.json at problem_doc_path into a Metadata object.

    Parameters
    ----------
    problem_doc_path
        Path where the problemDoc.json is located
    """
    with open(problem_doc_path) as file:
        problem_doc = json.load(file)
    return Metadata(problem_doc)
def load_data(data_path, problem_path) -> tuple:
    '''
    Load the dataset at data_path and its problem metadata at problem_path.

    Returns (dataset, problem) where problem is a Metadata instance.
    '''
    # Normalize a bare filesystem path into a file:// URI for the loader.
    if "file:" not in data_path:
        data_path = 'file://{dataset_path}'.format(
            dataset_path=os.path.abspath(data_path))

    with open(problem_path) as f:
        problem = Metadata(json.load(f))

    dataset = D3MDatasetLoader().load(dataset_uri=data_path)
    dataset = add_target_columns_metadata(dataset, problem)
    return dataset, problem
def convert_problem_doc_proto_to_metadata(problem_doc):
    """Convert a problem-doc protobuf message into a d3m Metadata object.

    Builds the equivalent problemDoc JSON structure ("about" section, inputs
    with their targets, and performance metrics) and wraps it in Metadata.

    Parameters
    ----------
    problem_doc
        Protobuf message with `.problem` and `.inputs` fields as read below.

    Returns
    -------
    Metadata
        Metadata constructed from the assembled JSON problem doc.
    """
    json_problem_doc = {}

    # General info
    json_problem_doc["about"] = {
        "problemID": problem_doc.problem.id,
        "problemVersion": problem_doc.problem.version,
        "problemName": problem_doc.problem.name,
        "taskType": problem_doc.problem.task_type,
        "taskSubType": problem_doc.problem.task_subtype,
        "problemSchemaVersion": "3.1.1",
        "problemDescription": problem_doc.problem.description
    }

    # Set inputs: one entry per input, each with its list of targets.
    inputs_data = []
    for input_proto in problem_doc.inputs:
        targets_data = [
            {
                "targetIndex": target_data.target_index,
                "resID": target_data.resource_id,
                "colIndex": target_data.column_index,
                "colName": target_data.column_name,
            }
            for target_data in input_proto.targets
        ]
        inputs_data.append({
            "datasetID": input_proto.dataset_id,
            "targets": targets_data
        })

    # Set metrics: map proto enum values back to their schema string names.
    # The reverse map is loop-invariant, so build it once (the original
    # rebuilt it — and debug-printed each metric — on every iteration).
    reverse_map = {v: k for k, v in PerformanceMetric.get_map().items()}
    perf_metrics = [
        {"metric": reverse_map[PerformanceMetric(pm.metric)]}
        for pm in problem_doc.problem.performance_metrics
    ]

    json_problem_doc["inputs"] = {
        "data": inputs_data,
        "performanceMetrics": perf_metrics
    }

    # From json use standard methods to parse to metadata
    return Metadata(json_problem_doc)
def load_test_dataset_for_pipeline(config_path) -> tuple:
    '''
    Load and return (test_dataset, test_problem) described by the
    test_config.json file found in config_path.
    '''
    test_config_path = os.path.join(config_path, "test_config.json")
    with open(test_config_path, "r") as f:
        test_config = json.load(f)

    # The original duplicated load_data()'s body verbatim; delegate to it
    # instead so the URI-normalization / target-metadata logic lives in one
    # place. Behavior is identical.
    return load_data(test_config["dataset_schema"],
                     test_config["problem_schema"])
def metadata_to_str(metadata: base.Metadata, selector: base.Selector = None) -> str:
    """Render *metadata* (optionally narrowed to *selector*) as a string."""
    sink = io.StringIO()
    metadata.pretty_print(selector, sink)
    return sink.getvalue()