Example #1
def createPipeline(port=None,
                   session=None,
                   data=None,
                   predictor=None,
                   response=None,
                   task_type=None,
                   task_subtype=None,
                   output_type=None,
                   metric=None):
    stub = get_stub(int(port))

    data_uri = 'file://%s' % data

    # predictor and response arrive as JSON-encoded lists of column names
    predictor = json.loads(predictor)
    response = json.loads(response)

    resp = stub.CreatePipelines(
        cpb.PipelineCreateRequest(
            context=Parse(session, cpb.SessionContext()),
            train_features=[
                cpb.Feature(feature_id=pred, data_uri=data_uri)
                for pred in predictor
            ],
            target_features=[
                cpb.Feature(feature_id=targ, data_uri=data_uri)
                for targ in response
            ],
            task=cpb.TaskType.Value(task_type.upper()),
            task_subtype=cpb.TaskSubtype.Value(toConstCase(task_subtype)),
            output=cpb.OutputType.Value(toConstCase(output_type)),
            metrics=[cpb.Metric.Value(toConstCase(metric))],
            task_description='TA2 pipeline creation',
            max_pipelines=5))

    # materialize the streamed replies as plain dicts
    return [json.loads(MessageToJson(x)) for x in resp]
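
# Hypothetical invocation of the helper above; the port, session JSON, file
# path, and column names are placeholders, not values from a real deployment.
pipelines = createPipeline(
    port='50051',
    session='{"session_id": "session_0"}',
    data='/tmp/data/trainData.csv',
    predictor='["cylinders", "displacement"]',
    response='["class"]',
    task_type='regression',
    task_subtype='univariate',
    output_type='real',
    metric='rootMeanSquaredError')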
Example #2
def createPipeline(context=None,
                   data_uri=None,
                   task_type=None,
                   task_subtype=None,
                   target_features=None,
                   predict_features=None,
                   metrics=None,
                   max_pipelines=10):

    stub = get_stub()

    problem_schema_path = os.environ.get('PROBLEM_ROOT')
    problem_supply = d3mds.D3MProblem(problem_schema_path)

    # get the target features into the record format expected by the API
    targets = problem_supply.get_targets()
    features = []
    for entry in targets:
        tf = core_pb2.Feature(resource_id=entry['resID'],
                              feature_name=entry['colName'])
        features.append(tf)

    # We are having trouble parsing the problem specs into valid API specs, so
    # for now we hard-code certain problem types. A more general lookup table
    # returning valid API codes would fix this; a sketch follows this example.
    task = taskTypeLookup(task_type)
    tasksubtype = subTaskLookup(task_subtype)

    # The metrics in the problem files are imprecise text versions of the
    # enumerations, so standardize on a fixed set here; the metrics argument
    # is ignored for now, and a lookup table would help here too.
    metrics = [
        core_pb2.F1_MICRO, core_pb2.ROC_AUC, core_pb2.ROOT_MEAN_SQUARED_ERROR,
        core_pb2.F1, core_pb2.R_SQUARED
    ]

    context_in = core_pb2.SessionContext(session_id=context)

    request_in = core_pb2.PipelineCreateRequest(
        context=context_in,
        dataset_uri=data_uri,
        task=task,
        task_subtype=tasksubtype,
        metrics=metrics,
        task_description='Modsquad pipeline create request',
        target_features=features,
        predict_features=predict_features or [],
        max_pipelines=max_pipelines)
    resp = stub.CreatePipelines(request_in)

    return [json.loads(MessageToJson(x)) for x in resp]
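
# A minimal sketch of the lookup tables mentioned above. The table contents
# are assumptions; the enum constants themselves appear in the other examples.
TASK_TYPE_TABLE = {
    'classification': core_pb2.CLASSIFICATION,
    'regression': core_pb2.REGRESSION,
}

TASK_SUBTYPE_TABLE = {
    'multiClass': core_pb2.MULTICLASS,
    'univariate': core_pb2.UNIVARIATE,
}

def taskTypeLookup(task_type):
    # fall back to the undefined enum instead of raising on unknown text
    return TASK_TYPE_TABLE.get(task_type, core_pb2.TASK_TYPE_UNDEFINED)

def subTaskLookup(task_subtype):
    return TASK_SUBTYPE_TABLE.get(task_subtype, core_pb2.TASK_SUBTYPE_UNDEFINED)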
Example #3
    def test_pipeline(self):
        "Tries setting up a new pipeline"
        channel = grpc.insecure_channel('localhost:45042')
        stub = core_pb2_grpc.CoreStub(channel)
        msg = core_pb2.SessionRequest(user_agent="unittest", version="Foo")
        session = stub.StartSession(msg)
        self.assertEqual(session.response_info.status.code, core_pb2.OK)

        pipeline_request = core_pb2.PipelineCreateRequest(
            context=session.context,
            dataset_uri=
            "file:///home/sheath/projects/D3M/cmu-ta3/test-data/185_baseball/TRAIN/dataset_TRAIN/datasetDoc.json",
            task=core_pb2.TASK_TYPE_UNDEFINED,
            task_subtype=core_pb2.TASK_SUBTYPE_UNDEFINED,
            task_description="",
            output=core_pb2.OUTPUT_TYPE_UNDEFINED,
            metrics=[],
            target_features=[],
            predict_features=[],
            max_pipelines=10)
        p = stub.CreatePipelines(pipeline_request)
        for response in p:
            self.assertEqual(response.response_info.status.code, core_pb2.OK)
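
        # Close the session once the stream is drained, using the EndSession
        # RPC shown in Example #5 (assumed to be available on this stub).
        stub.EndSession(session.context)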
Example #4
def pipeline_create_parse():
    # a standalone SessionContext; the request below sets its own context
    session_context = core_pb2.SessionContext()
    session_context.session_id = 'abc123'

    req = core_pb2.PipelineCreateRequest()
    req.context.session_id = 'session_0'

    req.train_features.add(
        feature_id='cylinders',
        data_uri='data/d3m/o_196seed/data/trainDatamerged.tsv')

    req.task = core_pb2.REGRESSION

    req.task_subtype = core_pb2.UNIVARIATE

    req.output = core_pb2.REAL

    req.metrics.append(core_pb2.ROOT_MEAN_SQUARED_ERROR)

    req.target_features.add(
        feature_id='class',
        data_uri='data/d3m/o_196seed/data/trainDatamerged.tsv')

    req.max_pipelines = 10

    # round-trip the request through its serialized form (helper sketched below)
    msg_and_back(req, core_pb2.PipelineCreateRequest)

    print('-' * 40)
    content = MessageToJson(req, including_default_value_fields=True)
    print(content)
    print('-' * 40)
    json_parse(content, core_pb2.PipelineCreateRequest)
    print('-' * 40)
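
# msg_and_back and json_parse are not shown above; a minimal sketch of what
# such round-trip helpers might look like. The helper names and print behavior
# are assumptions; the json_format calls themselves are the real API.
from google.protobuf.json_format import MessageToJson, Parse

def msg_and_back(msg, message_cls):
    # serialize a message to JSON, then parse it back into a fresh instance
    content = MessageToJson(msg)
    round_tripped = Parse(content, message_cls())
    print(round_tripped)
    return round_tripped

def json_parse(content, message_cls):
    # parse a JSON string into an instance of the given message class
    msg = Parse(content, message_cls())
    print(msg)
    return msg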
Example #5
def run():
    channel = grpc.insecure_channel('localhost:45042')
    stub = crpc.CoreStub(channel)
    dstub = drpc.DataExtStub(channel)
    dfstub = dfrpc.DataflowExtStub(channel)

    # Start Session
    session_response = stub.StartSession(
        core.SessionRequest(user_agent="xxx", version="1.0"))
    session_context = session_response.context
    print("Session started (%s)" % str(session_context.session_id))

    # Send pipeline creation request
    dataset_uri = "file:///tmp/data/185_baseball/185_baseball_dataset/datasetDoc.json"
    some_features = [
        core.Feature(resource_id="0", feature_name="d3mIndex"),
        core.Feature(resource_id="0", feature_name="Games_played"),
        core.Feature(resource_id="0", feature_name="Runs"),
        core.Feature(resource_id="0", feature_name="Hits"),
        core.Feature(resource_id="0", feature_name="Home_runs")
    ]
    target_features = [
        core.Feature(resource_id="0", feature_name="Hall_of_Fame")
    ]
    task = core.TaskType.Value('CLASSIFICATION')
    task_subtype = core.TaskSubtype.Value('MULTICLASS')
    task_description = "Classify Hall of Fame"
    output = core.OutputType.Value('OUTPUT_TYPE_UNDEFINED')
    metrics = [
        core.PerformanceMetric.Value('F1_MICRO'),
        core.PerformanceMetric.Value('F1_MACRO')
    ]
    max_pipelines = 10

    pipeline_ids = []

    print("Training with some features")
    pc_request = core.PipelineCreateRequest(context=session_context,
                                            dataset_uri=dataset_uri,
                                            predict_features=some_features,
                                            task=task,
                                            task_subtype=task_subtype,
                                            task_description=task_description,
                                            output=output,
                                            metrics=metrics,
                                            target_features=target_features,
                                            max_pipelines=max_pipelines)
    '''
    # Iterate over results
    for pcr in stub.CreatePipelines(pc_request):
        print(str(pcr))
        if len(pcr.pipeline_info.scores) > 0:
            pipeline_ids.append(pcr.pipeline_id)

    print("Training with some features")
    pc_request = core.PipelineCreateRequest(
        context = session_context,
        train_features = some_features,
        task = task,
        task_subtype = task_subtype,
        task_description = task_description,
        output = output,
        metrics = metrics,
        target_features = target_features,
        max_pipelines = max_pipelines
    )
    '''

    result = stub.CreatePipelines(pc_request)

    # Iterate over results
    for pcr in result:
        print(str(pcr))
        '''
        for gdr in dfstub.GetDataflowResults(dfext.PipelineReference(context = session_context,
                pipeline_id = pcr.pipeline_id)):
            print(gdr)
        '''
        if pcr.pipeline_info.scores:
            pipeline_id = pcr.pipeline_id
            pipeline_ids.append(pipeline_id)
            dflow = dfstub.DescribeDataflow(
                dfext.PipelineReference(context=session_context,
                                        pipeline_id=pipeline_id))
            print(dflow)

            exres = stub.ExportPipeline(
                core.PipelineExportRequest(
                    context=session_context,
                    pipeline_id=pipeline_id,
                    pipeline_exec_uri="file:///tmp/{}".format(pipeline_id)))
            print(exres)
            '''
            if pcr.pipeline_info.predict_result_uri is not None:
                df = pandas.read_csv(pcr.pipeline_info.predict_result_uri, index_col="d3mIndex")
                print(df)
            '''

    print("************** Executing/Testing Pipelines")

    # Execute pipelines
    for pipeline_id in pipeline_ids:
        print("Executing Pipeline %s" % pipeline_id)
        ep_request = core.PipelineExecuteRequest(context=session_context,
                                                 pipeline_id=pipeline_id,
                                                 dataset_uri=dataset_uri)
        for ecr in stub.ExecutePipeline(ep_request):
            print(str(ecr))
            # proto3 string fields are never None; check for a non-empty URI
            if ecr.result_uri:
                df = pandas.read_csv(ecr.result_uri, index_col="d3mIndex")
                print(df)

    list_request = core.PipelineListRequest(context=session_context)
    lrr = stub.ListPipelines(list_request)
    print(lrr.pipeline_ids)

    print("************** Cached pipeline create results")
    pcrr = core.PipelineCreateResultsRequest(context=session_context,
                                             pipeline_ids=lrr.pipeline_ids)
    for gcpr in stub.GetCreatePipelineResults(pcrr):
        print(str(gcpr))

    print("************** Cached pipeline execute results")
    perr = core.PipelineExecuteResultsRequest(context=session_context,
                                              pipeline_ids=lrr.pipeline_ids)
    for gepr in stub.GetExecutePipelineResults(perr):
        print(str(gepr))

    print("*********** Updating Metric to Accuracy.. Create pipelines again")
    metric = core.PerformanceMetric.Value('ACCURACY')
    ups_request = core.SetProblemDocRequest(
        context=session_context,
        updates=[
            core.SetProblemDocRequest.ReplaceProblemDocField(metric=metric)
        ])

    print(stub.SetProblemDoc(ups_request))
    print("********** Re-running pipeline creation")
    for pcr in stub.CreatePipelines(
            core.PipelineCreateRequest(context=session_context)):
        print(str(pcr))

    stub.EndSession(session_context)
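
if __name__ == '__main__':
    # exercise the whole TA2 walkthrough against a server on localhost:45042
    run()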
Example #6
def pipeline_create(info_str=None):
    """Send the pipeline create request via gRPC"""
    if info_str is None:
        info_str = get_test_info_str()

    if info_str is None:
        err_msg = 'UI Str for %s is None' % PIPELINE_CREATE_REQUEST
        return get_failed_precondition_response(err_msg)

    # --------------------------------
    # Convert info string to dict
    # --------------------------------
    try:
        info_dict = json.loads(info_str, object_pairs_hook=OrderedDict)
    except json.decoder.JSONDecodeError as err_obj:
        err_msg = 'Failed to convert UI Str to JSON: %s' % (err_obj)
        return get_failed_precondition_response(err_msg)

    if KEY_CONTEXT_FROM_UI not in info_dict:
        return get_failed_precondition_response(ERR_NO_CONTEXT)

    if KEY_SESSION_ID_FROM_UI not in info_dict[KEY_CONTEXT_FROM_UI]:
        return get_failed_precondition_response(ERR_NO_SESSION_ID)

    # --------------------------------
    # convert the JSON string to a gRPC request
    # --------------------------------
    try:
        req = Parse(info_str, core_pb2.PipelineCreateRequest())
    except ParseError as err_obj:
        err_msg = 'Failed to convert JSON to gRPC: %s' % (err_obj)
        return get_failed_precondition_response(err_msg)

    if settings.TA2_STATIC_TEST_MODE:

        template_info = get_predict_file_info_dict(info_dict.get('task'))

        template_str = get_grpc_test_json('test_responses/createpipeline_ok.json',
                                          template_info)

        # These next lines embed file uri content into the JSON
        embed_util = FileEmbedUtil(template_str)
        if embed_util.has_error:
            return get_failed_precondition_response(embed_util.error_message)

        return embed_util.get_final_results()

    # --------------------------------
    # Get the connection, return an error if there are channel issues
    # --------------------------------
    core_stub, err_msg = TA2Connection.get_grpc_stub()
    if err_msg:
        return get_failed_precondition_response(err_msg)

    # --------------------------------
    # Send the gRPC request
    # --------------------------------
    messages = []

    try:
        for reply in core_stub.CreatePipelines(req):
            user_msg = MessageToJson(reply)
            print(user_msg)
            messages.append(user_msg)
    except Exception as ex:
        return get_reply_exception_response(str(ex))

    print('end of queue. make message list')

    result_str = '[' + ', '.join(messages) + ']'

    print('embed file contents')
    embed_util = FileEmbedUtil(result_str)
    if embed_util.has_error:
        print('file embed error')
        return get_failed_precondition_response(embed_util.error_message)

    print('return results')
    return embed_util.get_final_results()
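
# Hypothetical caller; the field names mirror PipelineCreateRequest, and the
# session id is a placeholder. KEY_CONTEXT_FROM_UI / KEY_SESSION_ID_FROM_UI are
# assumed to name the 'context' and 'session_id' keys checked above.
info_str = json.dumps({
    'context': {'session_id': 'session_0'},
    'dataset_uri': 'file:///tmp/data/185_baseball/185_baseball_dataset/datasetDoc.json',
    'task': 'CLASSIFICATION',
    'metrics': ['F1_MICRO'],
    'max_pipelines': 5,
})
print(pipeline_create(info_str))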