Пример #1
0
def get_table_count(name, namespace):
    """Return the number of records stored in the given eggroll table."""
    from arch.api import eggroll
    # mode=1: cluster mode for this ad-hoc session.
    eggroll.init("get_intersect_output", mode=1)
    record_count = eggroll.table(name, namespace).count()
    print("table count:{}".format(record_count))
    return record_count
Пример #2
0
    def setUp(self):
        """Build one dense and one sparse table of Instance objects."""
        eggroll.init("test_instance")

        # Dense rows: feature vector is (i % 16) broadcast over 20 slots.
        dense_inst = [(row, Instance(features=(row % 16 * np.ones(20))))
                      for row in range(100)]
        self.dense_table = eggroll.parallelize(dense_inst,
                                               include_key=True,
                                               partition=2)

        sparse_inst = []
        col_zero = []
        for row in range(100):
            indices, data = [], []
            for col in range(20):
                val = ((row + 5) ** 3 + (col + 1) ** 4) % 16
                if val > 0:
                    indices.append(col)
                    data.append(val)
                if col == 0:
                    col_zero.append(val)
            features = SparseVector(indices, data, 20)
            sparse_inst.append((row, Instance(features=features)))

        self.sparse_inst = sparse_inst
        self.sparse_table = eggroll.parallelize(sparse_inst,
                                                include_key=True,
                                                partition=1)
Пример #3
0
    def _init_argument(self):
        """Parse CLI arguments, validate the config file, and bootstrap the job.

        Reads ``-c/--config`` and ``-j/--job_id``, runs the config checker,
        then initializes eggroll and federation before building the pipeline.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument('-c',
                            '--config',
                            required=True,
                            type=str,
                            help="Specify a config json file path")
        parser.add_argument('-j',
                            '--job_id',
                            type=str,
                            required=True,
                            help="Specify the job id")
        args = parser.parse_args()
        # Validate BEFORE first use (the original checked args.config only
        # after it had already been assigned).  argparse's required=True
        # normally guarantees a value; this is a defensive second check.
        if not args.config:
            LOGGER.error("Config File should be provided")
            exit(-100)
        config_path = args.config
        self.config_path = config_path
        self.job_id = args.job_id

        all_checker = AllChecker(config_path)
        all_checker.check_all()
        self._initialize(config_path)
        with open(config_path) as conf_f:
            runtime_json = json.load(conf_f)
        eggroll.init(self.job_id, self.workflow_param.work_mode)
        LOGGER.debug("The job id is {}".format(self.job_id))
        federation.init(self.job_id, runtime_json)
        LOGGER.debug("Finish eggroll and federation init")
        self._init_pipeline()
Пример #4
0
 def _init_argument(self):
     # Bootstraps logging, workflow config, eggroll and federation.
     # NOTE(review): LOGGER_path, config_path and job_id are not defined in
     # this method or any visible scope -- presumably module-level globals
     # or a truncated snippet; confirm before relying on this code.
     self._init_LOGGER(LOGGER_path)
     self._initialize(config_path)
     with open(config_path) as conf_f:
         runtime_json = json.load(conf_f)
     eggroll.init(job_id)
     federation.init(job_id, runtime_json)
Пример #5
0
def do_export_file(job_id, _data):
    """Export a stored data table to a delimited text file.

    :param job_id: job identifier used to initialize eggroll
    :param _data: dict with keys ``work_mode``, ``name``, ``namespace``,
        optional ``delimitor`` (default ``,``) and ``output_path``
    :raises ValueError: if any step of the export fails; the underlying
        error is chained as the cause.
    """
    try:
        work_mode = _data.get("work_mode")
        name = _data.get("name")
        namespace = _data.get("namespace")
        delimitor = _data.get("delimitor", ",")
        output_path = _data.get("output_path")

        eggroll.init(job_id, work_mode)

        with open(os.path.abspath(output_path), "w") as fout:
            data_table = storage.get_data_table(name=name, namespace=namespace)

            print('===== begin to export data =====')
            lines = 0

            for key, value in data_table.collect():
                # Empty values export the key alone.
                if not value:
                    fout.write(key + "\n")
                else:
                    fout.write(key + delimitor + value + "\n")

                lines += 1
                if lines % 2000 == 0:
                    print("===== export {} lines =====".format(lines))

            print("===== export {} lines totally =====".format(lines))
            print('===== export data finish =====')
    except Exception as e:
        # Chain the original error instead of the bare `except:` that hid it.
        raise ValueError("cannot export data, please check json file") from e
Пример #6
0
    def setUp(self):
        """Build list / tuple / ndarray rows and their eggroll tables for
        the encrypt-mode calculator tests."""
        eggroll.init("test_encrypt_mode_calculator")

        self.list_data = []
        self.tuple_data = []
        self.numpy_data = []

        for row in range(30):
            as_list = [100 * row + offset for offset in range(20)]
            self.list_data.append(as_list)
            self.tuple_data.append(tuple(as_list))
            self.numpy_data.append(np.array(as_list, dtype="int"))

        self.data_list = eggroll.parallelize(self.list_data,
                                             include_key=False,
                                             partition=10)
        self.data_tuple = eggroll.parallelize(self.tuple_data,
                                              include_key=False,
                                              partition=10)
        self.data_numpy = eggroll.parallelize(self.numpy_data,
                                              include_key=False,
                                              partition=10)
Пример #7
0
def data_to_eggroll_table(data, namespace, table_name,partition=1, work_mode=0):
    """Load key/value pairs into an eggroll table and report the row count."""
    eggroll.init(mode=work_mode)
    target = eggroll.table(table_name, namespace, partition=partition, create_if_missing=True, error_if_exist=False)
    target.put_all(data)
    print("------------load data finish!-----------------")
    print("total data_count:" + str(target.count()))
    print("namespace:%s, table_name:%s" % (namespace, table_name))
Пример #8
0
def get_cross_size(gpid, guid, g_table, hpid, huid, h_table):
    """Run guest and host intersect jobs in parallel and return the size
    of the resulting intersection table.

    :param gpid/guid/g_table: guest party id, user id and input table
    :param hpid/huid/h_table: host party id, user id and input table
    :return: number of records in the guest intersect output table
    """
    job_id = gen_job_id(hpid, gpid)
    roles = {consts.HOST: [hpid], consts.GUEST: [gpid]}
    args = ((huid, consts.HOST, job_id, h_table, roles),
            (guid, consts.GUEST, job_id, g_table, roles))

    # Run both roles concurrently; an exception in one role is printed
    # but does not abort the other.
    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
        futures = {executor.submit(role_jobs, *arg): arg[1] for arg in args}
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as exc:
                print(str(exc))

    intersect_table = consts.GUEST + '_intersect_output_' + job_id
    intersect_namespace = 'fdp_output_namespace'
    eggroll.init(job_id, WORK_MODE)
    table = eggroll.table(
        name=intersect_table,
        namespace=intersect_namespace,
    )
    table_size = table.count()
    return table_size
Пример #9
0
 def setUp(self):
     """Create keyed source tables used by the random sampler tests."""
     eggroll.init("test_random_sampler")
     keys = [n * 10 + 5 for n in range(100)]
     self.data = list(zip(keys, (n * n for n in range(100))))
     self.table = eggroll.parallelize(self.data, include_key=True)
     self.data_to_trans = list(zip(keys, (n ** 3 for n in range(100))))
     self.table_trans = eggroll.parallelize(self.data_to_trans,
                                            include_key=True)
Пример #10
0
 def setUp(self):
     """Build a BaseLogisticRegression model with default parameters."""
     # use default setting
     eggroll.init("123")
     self.model = BaseLogisticRegression(LogisticParam())
     self.model.header = []
     self.data_instance = self.__prepare_data()
Пример #11
0
def test_plain_lr():
    """Train a plain logistic regression on the two-moons dataset using
    eggroll-backed tensors, then report the test AUC."""
    from sklearn.datasets import make_moons
    # Change flow_id, otherwise the in-memory table may be overwritten.
    eggroll.init(mode=0)
    ns = str(uuid.uuid1())

    X = eggroll.table('testX7', ns, partition=2)
    Y = eggroll.table('testY7', ns, partition=2)

    b = np.array([0])
    eta = 1.2  # learning rate
    max_iter = 10

    total_num = 500

    _x, _y = make_moons(total_num, noise=0.25, random_state=12345)
    for i in range(np.shape(_y)[0]):
        X.put(i, _x[i])
        Y.put(i, _y[i])

    print(len([y for y in Y.collect()]))

    current_milli_time = lambda: int(round(time.time() * 1000))

    start = current_milli_time()
    shape_w = [np.shape(_x)[1]]
    w = np.ones(shape_w)

    print(w)
    X = TensorInEgg(None, None, X)
    Y = TensorInEgg(None, None, Y)
    w = TensorInPy(None, None, w)
    b = TensorInPy(None, None, b)

    itr = 0
    while itr < max_iter:
        # Sigmoid of the linear predictor.  (The original also computed a
        # dead `H = 1 / X` that was immediately overwritten -- removed.)
        H = 1.0 / (1 + ((X @ w + b) * -1).map(np.exp))
        R = H - Y  # residuals

        gradient_w = (R * X).sum() / total_num
        gradient_b = R.sum() / total_num
        w = w - eta * gradient_w
        b = b - eta * gradient_b
        print("aaa", w, b)
        itr += 1

    print("train total time: {}".format(current_milli_time() - start))
    _x_test, _y_test = make_moons(50, random_state=12345)
    _x_test = TensorInPy(None, None, _x_test)
    y_pred = 1.0 / (1 + ((_x_test @ w + b) * -1).map(np.exp))
    from sklearn import metrics

    auc = metrics.roc_auc_score(_y_test, y_pred.store.reshape(50))
    print("auc: {}".format(auc))
Пример #12
0
    def setUp(self):
        """Create dense Instance / raw-ndarray tables plus their transpose
        for the MultivariateStatisticalSummary tests."""
        eggroll.init("test_instance")
        headers = ['x' + str(i) for i in range(20)]
        self.header = headers
        self.eps = 1e-5
        self.count = 100

        dense_inst = []
        dense_not_inst = []
        transpose_rows = []
        for idx in range(self.count):
            features = idx % 16 * np.ones(20)
            dense_inst.append((idx, Instance(features=features)))
            dense_not_inst.append((idx, features))
            transpose_rows.append(features)
        self.dense_inst = dense_inst
        self.dense_not_inst = dense_not_inst
        # Features stacked row-wise, then transposed to feature-major.
        self.dense_data_transpose = np.array(transpose_rows).transpose()

        self.dense_table = eggroll.parallelize(dense_inst, include_key=True, partition=5)
        self.dense_not_inst_table = eggroll.parallelize(dense_not_inst, include_key=True, partition=5)
        self.dense_table.schema = {'header': headers}
        self.dense_not_inst_table.schema = {'header': headers}

        col_index = [1, 2, 3]
        self.col_index = col_index
        self.summary_obj = MultivariateStatisticalSummary(self.dense_table, col_index, abnormal_list=[None])
        self.summary_obj_not_inst = MultivariateStatisticalSummary(self.dense_not_inst_table, col_index,
                                                                   abnormal_list=[None])
Пример #13
0
    def setUp(self):
        """Generate sparse instances, gradient/hessian pairs and bin
        metadata for the FeatureHistogram tests."""
        self.feature_histogram = FeatureHistogram()
        eggroll.init("test_feature_histogram")
        data_insts = []
        for _ in range(1000):
            indices, data = [], []
            for feature_idx in range(10):
                value = random.randint(0, 5)
                if value != 0:
                    data.append(value)
                    indices.append(feature_idx)
            sparse_vec = SparseVector(indices, data, shape=10)
            data_insts.append((Instance(features=sparse_vec), (1, random.randint(0, 3))))
        self.node_map = {0: 0, 1: 1, 2: 2, 3: 3}
        self.data_insts = data_insts
        self.data_bin = eggroll.parallelize(data_insts, include_key=False)

        self.grad_and_hess_list = [(random.random(), random.random()) for _ in range(1000)]
        self.grad_and_hess = eggroll.parallelize(self.grad_and_hess_list, include_key=False)

        # Every feature shares the split points [0, 1, 2, 3, 4].
        self.bin_split_points = np.array([np.array(list(range(5))) for _ in range(10)])
        self.bin_sparse = [0] * 10
Пример #14
0
def import_offline_feature():
    """Import offline features for a registered job and mark the job as
    successful when the feature service accepts the request."""
    eggroll.init(job_id=generate_job_id(), mode=WORK_MODE)
    request_data = request.json
    try:
        job_id = request_data.get("jobId")
        if not job_id:
            return get_json_result(status=2, msg="no job id")
        job_data = query_job_by_id(job_id=job_id)
        if not job_data:
            return get_json_result(status=3,
                                   msg="can not found this job id: %s" %
                                   request_data.get("jobId", ""))
        response = GetFeature.import_data(request_data,
                                          json.loads(job_data[0]["config"]))
        if response.get("status", 1) != 0:
            return get_json_result(status=1,
                                   msg="request offline feature error: %s" %
                                   response.get("msg", ""))
        update_job_by_id(job_id=job_id,
                         update_data={
                             "status": "success",
                             "end_date": datetime.datetime.now()
                         })
        return get_json_result()
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1,
                               msg="request offline feature error: %s" % e)
Пример #15
0
 def setUp(self):
     """Prepare binary labels and random scores for LeastAbsoluteErrorLoss."""
     eggroll.init("test_least_abs_error_loss")
     self.lae_loss = LeastAbsoluteErrorLoss()
     self.y_list = [n % 2 for n in range(100)]
     self.predict_list = [random.random() for _ in range(100)]
     self.y = eggroll.parallelize(self.y_list, include_key=False)
     self.predict = eggroll.parallelize(self.predict_list, include_key=False)
Пример #16
0
 def setUp(self):
     """Prepare binary labels and random scores for LogCoshLoss."""
     eggroll.init("test_fair_loss")
     self.log_cosh_loss = LogCoshLoss()
     self.y_list = [n % 2 for n in range(100)]
     self.predict_list = [random.random() for _ in range(100)]
     self.y = eggroll.parallelize(self.y_list, include_key=False)
     self.predict = eggroll.parallelize(self.predict_list, include_key=False)
Пример #17
0
 def setUp(self):
     """Prepare binary labels and scores for SigmoidBinaryCrossEntropyLoss."""
     eggroll.init("test_cross_entropy")
     self.sigmoid_loss = SigmoidBinaryCrossEntropyLoss()
     self.y_list = [n % 2 for n in range(100)]
     self.predict_list = [random.random() for _ in range(100)]
     self.y = eggroll.parallelize(self.y_list, include_key=False)
     self.predict = eggroll.parallelize(self.predict_list, include_key=False)
Пример #18
0
 def setUp(self):
     """Prepare binary labels and scores for HuberLoss (delta = 1)."""
     eggroll.init("test_huber_loss")
     self.delta = 1
     self.huber_loss = HuberLoss(self.delta)
     self.y_list = [n % 2 for n in range(100)]
     self.predict_list = [random.random() for _ in range(100)]
     self.y = eggroll.parallelize(self.y_list, include_key=False)
     self.predict = eggroll.parallelize(self.predict_list, include_key=False)
Пример #19
0
 def setUp(self):
     """Prepare binary labels and scores for TweedieLoss (rho = 0.5)."""
     eggroll.init("test_fair_loss")
     self.rho = 0.5
     self.tweedie_loss = TweedieLoss(self.rho)
     self.y_list = [n % 2 for n in range(100)]
     self.predict_list = [random.random() for _ in range(100)]
     self.y = eggroll.parallelize(self.y_list, include_key=False)
     self.predict = eggroll.parallelize(self.predict_list, include_key=False)
Пример #20
0
    def setUp(self):
        """Create label tables used by the classify/regression checkers."""
        eggroll.init("test_label_checker")

        self.classify_checker = ClassifyLabelChecker()
        self.regression_checker = RegressionLabelChecker()

        self.small_label_set = [n % 5 for n in range(100)]
        self.classify_y = eggroll.parallelize(self.small_label_set,
                                              include_key=False)
        self.regression_label = [random.random() for _ in range(100)]
        # NOTE(review): include_key is not passed here (unlike classify_y) --
        # relies on eggroll.parallelize's default; confirm this is intended.
        self.regression_y = eggroll.parallelize(self.regression_label)
Пример #21
0
def predict(gid, g_table, gy_id, hid, h_table, hy_id, model_name):
    """Run a predict job and return (id, label, prob) rows as a DataFrame."""
    job_id = gen_job_id(hid, gid)
    run_jobs(gid, g_table, gy_id, hid, h_table, hy_id, job_id, 'predict', model_name)
    eggroll.init(job_id, WORK_MODE)
    output_name = consts.GUEST + '_predict_table_' + job_id
    rows = list(eggroll.table(output_name, PREDICT_NAMESPACE).collect())
    # Each value is a (label, prob, extra) tuple; the extra field is dropped.
    records = [(key, label, prob) for key, (label, prob, _) in rows]
    return pd.DataFrame.from_records(records, columns=['id', 'label', 'prob'])
Пример #22
0
def import_id():
    """Import a batch of ids into the id library.

    Ids arrive in ranges; when the final range is received the new table
    is validated against the expected total, activated as ``use_data_id``
    and the previous table destroyed.
    """
    eggroll.init(job_id=generate_job_id(), mode=WORK_MODE)
    request_data = request.json
    table_name_space = "id_library"
    try:
        id_library_info = eggroll.table("info",
                                        table_name_space,
                                        partition=10,
                                        create_if_missing=True,
                                        error_if_exist=False)
        # BUG FIX: request_data is a dict and eggroll tables expose get();
        # the original called a non-existent .request() method throughout.
        if request_data.get("rangeStart") == 0:
            data_id = generate_job_id()
            id_library_info.put("tmp_data_id", data_id)
        else:
            data_id = id_library_info.get("tmp_data_id")
        data_table = eggroll.table(data_id,
                                   table_name_space,
                                   partition=50,
                                   create_if_missing=True,
                                   error_if_exist=False)
        for i in request_data.get("ids", []):
            data_table.put(i, "")
        if request_data.get("rangeEnd") and request_data.get(
                "total") and (request_data.get("total") -
                              request_data.get("rangeEnd") == 1):
            # Final range received: validate and swap in the new table.
            new_id_count = data_table.count()
            if new_id_count == request_data["total"]:
                id_library_info.put(
                    data_id,
                    json.dumps({
                        "salt": request_data.get("salt"),
                        "saltMethod": request_data.get("saltMethod")
                    }))
                old_data_id = id_library_info.get("use_data_id")
                id_library_info.put("use_data_id", data_id)
                # BUG FIX: logging lazy args use %-style placeholders; the
                # original "{}" placeholders were never interpolated.
                logger.info(
                    "import id success, dtable name is %s, namespace is %s",
                    data_id, table_name_space)

                # TODO: destroy DTable, should be use a lock
                old_data_table = eggroll.table(old_data_id,
                                               table_name_space,
                                               partition=50,
                                               create_if_missing=True,
                                               error_if_exist=False)
                old_data_table.destroy()
                id_library_info.delete(old_data_id)
            else:
                data_table.destroy()
                return get_json_result(
                    2, "the actual amount of data is not equal to total.")
        return get_json_result()
    except Exception as e:
        logger.exception(e)
        return get_json_result(1, "import error.")
Пример #23
0
    def setUp(self):
        """Create labeled and unlabeled instance tables for the stratified
        sampler tests."""
        eggroll.init("test_stratified_sampler")
        self.data = [(n, Instance(label=n % 4, features=n * n))
                     for n in range(1000)]
        self.data_to_trans = [(n, Instance(features=n ** 3))
                              for n in range(1000)]
        self.table = eggroll.parallelize(self.data, include_key=True)
        self.table_trans = eggroll.parallelize(self.data_to_trans, include_key=True)
Пример #24
0
 def setUp(self):
     """Prepare 5-class labels and 5-way score vectors for softmax loss."""
     eggroll.init("test_cross_entropy")
     self.softmax_loss = SoftmaxCrossEntropyLoss()
     self.y_list = [n % 5 for n in range(100)]
     self.predict_list = [np.array([random.random() for _ in range(5)])
                          for _ in range(100)]
     self.y = eggroll.parallelize(self.y_list, include_key=False)
     self.predict = eggroll.parallelize(self.predict_list, include_key=False)
Пример #25
0
def query_model_version_history():
    """Return the model version history for the namespace named in the
    posted config file."""
    request_data = request.json
    try:
        conf = file_utils.load_json_conf(request_data.get("config_path"))
        eggroll.init(mode=WORK_MODE)
        history = version_history(data_table_namespace=conf.get("namespace"))
        return get_json_result(msg=json.dumps(history))
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1, msg="load model error: %s" % e)
Пример #26
0
    def _init_argument(self):
        # Initializes the guest workflow, eggroll and federation.
        # NOTE(review): config_path and job_id are not defined in this
        # method or any visible scope -- presumably module-level globals or
        # a truncated snippet; confirm before relying on this code.
        self._initialize(config_path)
        with open(config_path) as conf_f:
            runtime_json = json.load(conf_f)

        LOGGER.debug("The Guest job id is {}".format(job_id))
        LOGGER.debug("The Guest work mode id is {}".format(self.workflow_param.work_mode))
        eggroll.init(job_id, self.workflow_param.work_mode)
        federation.init(job_id, runtime_json)
        LOGGER.debug("Finish eggroll and federation init")
Пример #27
0
    def setUp(self):
        """Persist two small csv-style tables used by the dataio tests."""
        eggroll.init("test_dataio" + str(int(time.time())))
        self.table = "dataio_table_test"
        self.namespace = "dataio_test"
        rows = [("a", "1,2,-1,0,0,5"), ("b", "4,5,6,0,1,2")]
        eggroll.parallelize(rows, include_key=True).save_as(self.table, self.namespace)

        self.table2 = "dataio_table_test2"
        self.namespace2 = "dataio_test2"
        # Second table contains the abnormal/missing-value tokens.
        rows2 = [("a", '-1,,NA,NULL,null,2')]
        eggroll.parallelize(rows2, include_key=True).save_as(self.table2, self.namespace2)
Пример #28
0
def init(job_id, runtime_conf, mode, server_conf_path="arch/conf/server_conf.json"):
    """Initialize eggroll and the federation layer for a fate_script job.

    :param job_id: job identifier
    :param runtime_conf: path to the runtime config json
    :param mode: WorkMode value; must not be None
    :param server_conf_path: server config path used in cluster mode
    :raises EnvironmentError: if mode is None
    """
    # Fail fast: the original checked `mode is None` only AFTER
    # eggroll.init(job_id, mode) had already consumed it.
    if mode is None:
        raise EnvironmentError("eggroll should be initialized before fate_script")
    eggroll.init(job_id, mode)
    print("runtime_conf:{}".format(runtime_conf))
    all_checker = AllChecker(runtime_conf)
    all_checker.check_all()
    with open(runtime_conf) as conf_p:
        runtime_json = json.load(conf_p)

    if mode == WorkMode.STANDALONE:
        RuntimeInstance.FEDERATION = standalone_fate_script.init(job_id=job_id, runtime_conf=runtime_json)
    else:
        RuntimeInstance.FEDERATION = cluster_fate_script.init(job_id=job_id, runtime_conf=runtime_json, server_conf_path=server_conf_path)
Пример #29
0
 def setUp(self):
     """Create a 1000-row table of 200-feature labeled instances."""
     eggroll.init("123")
     self.data_num = 1000
     self.feature_num = 200
     rows = []
     for idx in range(self.data_num):
         features = idx * np.ones(self.feature_num)
         inst = Instance(inst_id=idx, features=features, label=0)
         rows.append((str(idx), inst))
     self.table = eggroll.parallelize(rows,
                                      include_key=True,
                                      partition=3)
Пример #30
0
    def setUp(self):
        """Build fixtures for the hetero federated aggregator tests:
        add, mean, separate and add-square cases."""
        # for test_aggregate_add
        eggroll.init("test_hetero_federated_aggregator")
        self.size = 10
        self.table_a = eggroll.parallelize(range(self.size))
        self.table_b = eggroll.parallelize(list(range(self.size)))
        # Expected element-wise sum of table_a + table_b.
        self.add_a_b = [i * 2 for i in range(self.size)]

        # for test_aggregate_mean
        self.table_d_tuple = eggroll.parallelize([(i, i + 1)
                                                  for i in range(self.size)])
        self.reduce_a = np.sum(list(range(self.size))) / self.size * 1.0
        # NOTE(review): the second element sums range(size + 1) but still
        # divides by size -- presumably matches mean of the (i + 1) column;
        # confirm against the aggregator's mean semantics.
        self.reduce_d_tuple = [
            np.sum(list(range(self.size))) / self.size * 1.0,
            np.sum(list(range(self.size + 1))) / self.size * 1.0
        ]

        # for test_separate: split indices 0..size-1 into consecutive
        # chunks of 10% / 20% / 30% / 40% of the data.
        self.separate_data = list(range(self.size))
        self.separate_size_list = [
            int(0.1 * self.size),
            int(0.2 * self.size),
            int(0.3 * self.size),
            int(0.4 * self.size)
        ]
        self.separate_result = []
        cur_index = 0
        for i in range(len(self.separate_size_list)):
            self.separate_result.append(
                self.separate_data[cur_index:cur_index +
                                   self.separate_size_list[i]])
            cur_index += self.separate_size_list[i]

        # for test_aggregate_add_square
        this_size = 10000
        list_a = [random.randint(0, 1000) for _ in range(this_size)]
        list_b = [random.randint(0, 1000) for _ in range(this_size)]
        self.table_list_a = eggroll.parallelize(list_a)
        self.table_list_b = eggroll.parallelize(list_b)
        self.table_list_a_square = eggroll.parallelize(
            [np.square(i) for i in list_a])
        self.table_list_b_square = eggroll.parallelize(
            [np.square(i) for i in list_b])

        # Expected sorted element-wise (a + b)^2 values.
        self.list_add_square_result = list(
            np.sort(
                np.array([np.square(i + j)
                          for (i, j) in zip(list_a, list_b)])))