示例#1
0
def test_plain_lr():
    """Train a plain logistic regression on eggroll-backed tensors and print its AUC.

    The training data comes from ``sklearn.datasets.make_moons``; features and
    labels are stored row by row in two eggroll tables, wrapped in
    ``TensorInEgg``, and the weights/bias are updated by batch gradient descent
    for ``max_iter`` iterations. The fitted parameters are then evaluated on a
    fresh 50-sample moons set and the ROC AUC is printed.
    """
    from sklearn import metrics
    from sklearn.datasets import make_moons

    # Original note: change the flow_id, otherwise the in-memory table may be
    # overwritten.
    eggroll.init(mode=0)
    ns = str(uuid.uuid1())

    X = eggroll.table('testX7', ns, partition=2)
    Y = eggroll.table('testY7', ns, partition=2)

    b = np.array([0])
    eta = 1.2
    max_iter = 10
    total_num = 500

    _x, _y = make_moons(total_num, noise=0.25, random_state=12345)
    for i, (features, label) in enumerate(zip(_x, _y)):
        X.put(i, features)
        Y.put(i, label)

    print(len(list(Y.collect())))

    def current_milli_time():
        return int(round(time.time() * 1000))

    start = current_milli_time()
    # One weight per feature column.
    w = np.ones([np.shape(_x)[1]])
    print(w)

    X = TensorInEgg(None, None, X)
    Y = TensorInEgg(None, None, Y)
    w = TensorInPy(None, None, w)
    b = TensorInPy(None, None, b)

    for _ in range(max_iter):
        # Sigmoid of the linear score, then the residual against the labels.
        H = 1.0 / (1 + ((X @ w + b) * -1).map(np.exp))
        R = H - Y

        gradient_w = (R * X).sum() / total_num
        gradient_b = R.sum() / total_num
        w = w - eta * gradient_w
        b = b - eta * gradient_b
        print("aaa", w, b)

    print("train total time: {}".format(current_milli_time() - start))

    _x_test, _y_test = make_moons(50, random_state=12345)
    _x_test = TensorInPy(None, None, _x_test)
    y_pred = 1.0 / (1 + ((_x_test @ w + b) * -1).map(np.exp))

    auc = metrics.roc_auc_score(_y_test, y_pred.store.reshape(50))
    print("auc: {}".format(auc))
示例#2
0
def split_into_guest_host_dtable(X, y, overlap_ratio=0.2, guest_split_ratio=0.5, guest_feature_num=16,
                                 tables_name=None, partition=1):
    """Split samples vertically into a guest table and a host table.

    The first ``overlap_size`` rows go to both parties; the next ``guest_size``
    rows go to the guest only; the remaining rows go to the host only. The
    guest receives feature columns ``[:guest_feature_num]`` and the host
    receives columns ``[guest_feature_num:]``.

    :param X: 2D array of features, indexed as ``X[row, columns]``
    :param y: 2D array of labels; column 0 is used as the label
    :param overlap_ratio: fraction of all rows shared by guest and host
    :param guest_split_ratio: fraction of the non-overlapping rows given to the guest
    :param guest_feature_num: number of leading feature columns given to the guest
    :param tables_name: optional dict with keys ``guest_table_ns``,
        ``guest_table_name``, ``host_table_ns`` and ``host_table_name``
    :param partition: partition count used for both tables
    :return: ``(guest_data, host_data, overlap_indexes)``
    """
    data_size = X.shape[0]
    overlap_size = int(data_size * overlap_ratio)
    guest_size = int((data_size - overlap_size) * guest_split_ratio)
    overlap_indexes = np.array(range(overlap_size))
    guest_end = overlap_size + guest_size

    # Default names are the key strings themselves; a caller-supplied dict
    # must provide all four keys.
    name_keys = ("guest_table_ns", "guest_table_name", "host_table_ns", "host_table_name")
    names = {key: key for key in name_keys}
    if tables_name is not None:
        for key in name_keys:
            names[key] = tables_name[key]

    def _entry(row, columns):
        # One (key, Instance) pair holding the selected feature columns as a 1 x n array.
        return (row, Instance(inst_id=None, weight=1.0, features=X[row, columns].reshape(1, -1), label=y[row, 0]))

    guest_columns = slice(None, guest_feature_num)
    host_columns = slice(guest_feature_num, None)

    guest_rows = [_entry(row, guest_columns) for row in range(0, guest_end)]
    guest_data = table(name=names["guest_table_name"], namespace=names["guest_table_ns"], partition=partition)
    guest_data.put_all(guest_rows)

    # Host rows: the shared prefix followed by the tail the guest never sees.
    host_indices = list(range(0, overlap_size)) + list(range(guest_end, len(X)))
    host_rows = [_entry(row, host_columns) for row in host_indices]
    host_data = table(name=names["host_table_name"], namespace=names["host_table_ns"], partition=partition)
    host_data.put_all(host_rows)

    return guest_data, host_data, overlap_indexes
示例#3
0
    def test_read_guest_host_eggroll_table(self):
        """Tables written by ``split_into_guest_host_dtable`` can be re-opened by name.

        Splits random data into guest/host tables under explicit names, opens
        the same names again through ``table`` and checks the row counts agree.
        """
        X = np.random.rand(30, 3)
        y = np.random.rand(30, 1)

        tables_name = {
            "guest_table_ns": "guest_table_ns_01",
            "guest_table_name": "guest_table_name_01",
            "host_table_ns": "host_table_ns_01",
            "host_table_name": "host_table_name_01",
        }

        guest_data, host_data, overlap_indexes = split_into_guest_host_dtable(
            X, y,
            overlap_ratio=0.2,
            guest_split_ratio=0.5,
            guest_feature_num=16,
            tables_name=tables_name)

        # Sizes as seen through the handles returned by the split.
        sizes_from_split = (guest_data.count(), host_data.count())

        reopened_guest = table(tables_name["guest_table_name"], tables_name["guest_table_ns"])
        reopened_host = table(tables_name["host_table_name"], tables_name["host_table_ns"])

        # Sizes as seen through handles re-opened by name.
        sizes_reopened = (reopened_guest.count(), reopened_host.count())

        assert sizes_from_split[0] == sizes_reopened[0]
        assert sizes_from_split[1] == sizes_reopened[1]
示例#4
0
def import_id():
    """Import one batch of ids into the id library and publish it when complete.

    Reads the JSON body of the current request (keys used: ``rangeStart``,
    ``rangeEnd``, ``total``, ``ids``, ``salt``, ``saltMethod``). A batch with
    ``rangeStart == 0`` starts a new upload and records a fresh table id under
    ``tmp_data_id``; later batches append to that table. When
    ``total - rangeEnd == 1`` the upload is treated as finished: if the table
    holds exactly ``total`` ids it becomes the active table (``use_data_id``)
    and the previously active one is destroyed, otherwise the new table is
    destroyed and an error result is returned.

    :return: ``get_json_result()`` on success, ``get_json_result(2, ...)`` when
        the id count does not match ``total``, ``get_json_result(1, ...)`` on
        any exception.
    """
    eggroll.init(job_id=generate_job_id(), mode=WORK_MODE)
    request_data = request.json
    table_name_space = "id_library"
    try:
        id_library_info = eggroll.table("info",
                                        table_name_space,
                                        partition=10,
                                        create_if_missing=True,
                                        error_if_exist=False)
        if request_data.get("rangeStart") == 0:
            data_id = generate_job_id()
            id_library_info.put("tmp_data_id", data_id)
        else:
            data_id = id_library_info.get("tmp_data_id")
        data_table = eggroll.table(data_id,
                                   table_name_space,
                                   partition=50,
                                   create_if_missing=True,
                                   error_if_exist=False)
        for i in request_data.get("ids", []):
            data_table.put(i, "")

        range_end = request_data.get("rangeEnd")
        total = request_data.get("total")
        if range_end and total and total - range_end == 1:
            # Last batch of the upload.
            new_id_count = data_table.count()
            if new_id_count != total:
                data_table.destroy()
                return get_json_result(
                    2, "the actual amount of data is not equal to total.")

            id_library_info.put(
                data_id,
                json.dumps({
                    "salt": request_data.get("salt"),
                    "saltMethod": request_data.get("saltMethod")
                }))
            old_data_id = id_library_info.get("use_data_id")
            id_library_info.put("use_data_id", data_id)
            logger.info(
                "import id success, dtable name is %s, namespace is %s",
                data_id, table_name_space)

            # TODO: destroy DTable, should be use a lock
            # Skip cleanup when there is no previous table (first import) or
            # when the previous id is the table that was just published.
            if old_data_id and old_data_id != data_id:
                old_data_table = eggroll.table(old_data_id,
                                               table_name_space,
                                               partition=50,
                                               create_if_missing=True,
                                               error_if_exist=False)
                old_data_table.destroy()
                id_library_info.delete(old_data_id)
        return get_json_result()
    except Exception as e:
        logger.exception(e)
        return get_json_result(1, "import error.")
示例#5
0
    def test_destroy_table(self):
        """Destroying a table through one handle leaves a re-opened handle with zero rows."""
        name, namespace = "table_name", "table_ns"
        expect_data = np.random.rand(10, 10)

        created = create_table(expect_data, model_table_name=name, model_namespace=namespace, persistent=True)
        reopened = table(name=name, namespace=namespace)
        assert created.count() == reopened.count()

        reopened.destroy()

        # Opening the same name again after destroy() must yield an empty table.
        after_destroy = table(name=name, namespace=namespace)
        assert after_destroy.count() == 0
示例#6
0
def show_embedding():
    """Print cosine similarities between host and guest node embeddings.

    Prints the guest embedding count, then the similarity of the host and
    guest embeddings for the first five keys of the ``common_nodes`` table,
    and finally the similarity between host node 8 and guest node 12.
    """
    host_embedding = eggroll.table('host', 'node_embedding', persistent=True)
    guest_embedding = eggroll.table('guest', 'node_embedding', persistent=True)
    print(guest_embedding.count())

    common_table = eggroll.table("common_nodes",
                                 "common_nodes",
                                 persistent=True)
    node_keys = common_table.take(common_table.count(), keysOnly=True)
    for raw_key in node_keys[0:5]:
        node = int(raw_key)
        host_vec = host_embedding.get(node)
        guest_vec = guest_embedding.get(node)
        print("node: {}, sim: {}".format(node, cos_sim(host_vec, guest_vec)))

    print(cos_sim(host_embedding.get(8), guest_embedding.get(12)))
    """
示例#7
0
    def gen_data_instance(self, table_name, namespace):
        """Return the data table to train on.

        When the data-model parameters say ``is_read_table``, the existing
        eggroll table ``(table_name, namespace)`` is returned. Otherwise guest
        and host tables are generated from the configured file under fresh,
        uuid-suffixed names and only the guest table is returned; the
        ``table_name``/``namespace`` arguments are ignored in that case.
        """
        data_model = self._get_data_model_param()
        if data_model.is_read_table:
            return eggroll.table(table_name, namespace)

        file_path = data_model.file_path
        loader_kwargs = dict(
            overlap_ratio=data_model.overlap_ratio,
            guest_split_ratio=data_model.guest_split_ratio,
            guest_feature_num=data_model.n_feature_guest,
            num_samples=data_model.num_samples,
            balanced=data_model.balanced,
        )

        namespace, table_name = generate_table_namespace_n_name(file_path)
        suffix = "_" + str(uuid.uuid1())

        # Same base names for both roles, distinguished by a role prefix.
        tables_name = {}
        for role in ("guest", "host"):
            tables_name[role + "_table_ns"] = role + "_" + namespace + suffix
            tables_name[role + "_table_name"] = role + "_" + table_name + suffix

        guest_data, _host_data = load_guest_host_dtable_from_UCI_Credit_Card(
            file_path=file_path,
            tables_name=tables_name,
            **loader_kwargs)
        return guest_data
示例#8
0
def data_to_eggroll_table(data, namespace, table_name, partition=1, work_mode=0):
    """Initialise eggroll, write ``data`` into the named table and print a summary.

    :param data: key/value pairs handed to ``put_all``
    :param namespace: namespace of the target table
    :param table_name: name of the target table
    :param partition: partition count used when opening the table
    :param work_mode: value passed to ``eggroll.init(mode=...)``
    """
    eggroll.init(mode=work_mode)

    table_options = dict(partition=partition, create_if_missing=True, error_if_exist=False)
    loaded = eggroll.table(table_name, namespace, **table_options)
    loaded.put_all(data)

    print("------------load data finish!-----------------")
    row_total = loaded.count()
    print("total data_count:" + str(row_total))
    print("namespace:%s, table_name:%s" % (namespace, table_name))
示例#9
0
def get_table_count(name, namespace):
    """Initialise eggroll (mode 1), print and return the row count of a table.

    :param name: table name
    :param namespace: table namespace
    :return: the value of ``count()`` on the table
    """
    from arch.api import eggroll

    eggroll.init("get_intersect_output", mode=1)
    row_total = eggroll.table(name, namespace).count()
    print("table count:{}".format(row_total))
    return row_total
示例#10
0
    def _distributed_negative_sampling_dst(self, adj_instances, src=consts.HOST, dst=consts.GUEST):
        """Pair negative-sample ids received from ``src`` with locally sampled nodes.

        Fetches the ids sent by the ``src`` party through federation, builds a
        sampling distribution from ``adj_instances`` and, for every received
        id, stores ``(sampled_node, -1)`` under that id in a new
        non-persistent eggroll table.

        :param adj_instances: input handed to
            ``NeighborsSampling.generate_nega_distribution``
        :param src: sending party, ``consts.HOST`` or ``consts.GUEST``
        :param dst: receiving party; must be the opposite of ``src``
        :return: the eggroll table holding the sampled negative instances
        :raises NameError: if ``src``/``dst`` are not a valid host/guest pair
        """
        if src == consts.HOST:
            if dst != consts.GUEST:
                raise NameError("if src is host, then dst should be guest!!!")
            nega_ids_transfer = self.transfer_variable.host_neg_samp_ids
        elif src == consts.GUEST:
            if dst != consts.HOST:
                raise NameError("if src is guest, then dst should be host!!!")
            nega_ids_transfer = self.transfer_variable.guest_neg_samp_ids
        else:
            raise NameError("src should be choose from {host, guest}")

        distributed_negative_ids = federation.get(name=nega_ids_transfer.name,
                                                  tag=self.transfer_variable.generate_transferid(nega_ids_transfer),
                                                  idx=0)
        LOGGER.info("Get distributed nagative samples from {}".format(src))
        # Log a preview of at most ten ids; zip() stops early when fewer than
        # ten were received instead of indexing past the end.
        for _, negative_id in zip(range(10), distributed_negative_ids):
            LOGGER.info("id:{}".format(negative_id))

        # Sample some negative samples.
        distribution = NeighborsSampling.generate_nega_distribution(adj_instances)
        sampler = DiscreteDistributionSampler([data[1] for data in distribution])

        distributed_negative_instances_dst = eggroll.table(name=dst + eggroll.generateUniqueId(),
                                                           namespace='neighbors_sampling/distributed_sampling',
                                                           persistent=False)

        for negative_id in distributed_negative_ids:
            index = sampler.sampling()
            distributed_negative_instances_dst.put(negative_id, (distribution[index][0], -1))

        logDtableInstances(LOGGER, distributed_negative_instances_dst, isInstance=False)

        return distributed_negative_instances_dst
示例#11
0
def feed_into_dtable(ids, X, y, sample_range, feature_range, tables_name=None, partition=1):
    """
    Build an eggroll table from a rectangular slice of the given data.

    Each selected row ``i`` is stored under key ``ids[i]`` as an ``Instance``
    whose features are ``X[i, feature_range[0]:feature_range[1]]`` and whose
    label is ``y[i, 0]``.

    parameters
    ----------
    :param ids: 1D numpy array
    :param X: 2D numpy array
    :param y: 2D numpy array
    :param sample_range: a tuple (start, stop) of row indices to feed into the dtable
    :param feature_range: a tuple (start, stop) of feature columns to feed into the dtable
    :param tables_name: optional dictionary with keys ``table_ns`` and ``table_name``;
        when omitted, a default namespace and a timestamp-based name are used
    :param partition: number of partitions used when creating the dtable
    :return: an eggroll dtable
    """
    timestamp_name = get_timestamp()
    if tables_name is None:
        table_ns, table_name = "default_table_namespace", timestamp_name
    else:
        table_ns, table_name = tables_name["table_ns"], tables_name["table_name"]

    first_row, stop_row = sample_range[0], sample_range[1]
    sample_list = [
        (ids[row], Instance(inst_id=ids[row],
                            features=X[row, feature_range[0]:feature_range[1]],
                            label=y[row, 0]))
        for row in range(first_row, stop_row)
    ]

    data_table = table(name=table_name, namespace=table_ns, partition=partition)
    data_table.put_all(sample_list)
    return data_table
示例#12
0
def save_data(kv_data: Iterable,
              name,
              namespace,
              partition=1,
              create_if_missing=True,
              error_if_exist=False,
              version_log=None):
    """
    Write key/value data into a data table and record a version entry.

    :param kv_data: iterable of key/value pairs passed to ``put_all``
    :param name: table name of data table
    :param namespace: table namespace of data table
    :param partition: number of partition
    :param create_if_missing: forwarded to ``eggroll.table``
    :param error_if_exist: forwarded to ``eggroll.table``
    :param version_log: message stored with the version; when empty, an
        automatic message with the current time is used
    :return:
        data table instance
    """
    table_options = dict(partition=partition,
                         create_if_missing=create_if_missing,
                         error_if_exist=error_if_exist)
    data_table = eggroll.table(name=name, namespace=namespace, **table_options)
    data_table.put_all(kv_data)

    if not version_log:
        version_log = "[AUTO] save data at %s." % datetime.datetime.now()
    control.save_version(name=name,
                         namespace=namespace,
                         version_log=version_log)
    return data_table
示例#13
0
def table(name: str, namespace: str, partition: int = 1, persistent: bool = True, create_if_missing: bool = True,
          error_if_exist: bool = False,
          in_place_computing: bool = False):
    """Open an eggroll table, forwarding every argument to ``eggroll.table`` by keyword.

    :return: whatever ``eggroll.table`` returns for these arguments
    """
    options = dict(
        partition=partition,
        persistent=persistent,
        create_if_missing=create_if_missing,
        error_if_exist=error_if_exist,
        in_place_computing=in_place_computing,
    )
    return eggroll.table(name=name, namespace=namespace, **options)
示例#14
0
def show_distributed_samples(topk):
    """Print anchor/target sample pairs side by side.

    Collects the host ``anchor`` table and the guest ``target`` table and
    prints them pairwise, limited to the first ``topk + 10`` anchor rows (or
    fewer if the target table is shorter).

    :param topk: base number of rows to show; ten more are added to the limit
    """
    anchor_table = eggroll.table(
        'anchor',
        "neighbors_samples/distributed_samples/host",
        persistent=True)
    target_table = eggroll.table(
        'target',
        "neighbors_samples/distributed_samples/guest",
        persistent=True)

    anchor_rows, target_rows = (list(tbl.collect()) for tbl in (anchor_table, target_table))

    limit = topk + 10
    template = "sample_id: {}, anchor:{}   sample_id: {}, target:{}"
    for anchor, target in zip(anchor_rows[:limit], target_rows):
        print(template.format(anchor[0], anchor[1], target[0], target[1]))
示例#15
0
def save_model(buffer_type, proto_buffer, name, namespace, version_log=None):
    """Store a serialized protobuf model in a table and record a version entry.

    :param buffer_type: key under which the serialized buffer is stored
    :param proto_buffer: object exposing ``SerializeToString()``
    :param name: table name of the model table
    :param namespace: namespace of the model table
    :param version_log: message stored with the version; when empty, an
        automatic message with the current time is used
    """
    partition_count = get_model_table_partition_count()
    model_table = eggroll.table(name=name, namespace=namespace, partition=partition_count,
                                create_if_missing=True, error_if_exist=False)
    # TODO: model slice?
    serialized = proto_buffer.SerializeToString()
    model_table.put(buffer_type, serialized, use_serialize=False)

    if not version_log:
        version_log = "[AUTO] save model at %s." % datetime.datetime.now()
    version_control.save_version(name=name, namespace=namespace, version_log=version_log)
示例#16
0
 def load_eval_result(self):
     """Open and return the evaluation output table named in ``self.workflow_param``."""
     params = self.workflow_param
     eval_data = eggroll.table(name=params.evaluation_output_table,
                               namespace=params.evaluation_output_namespace)
     LOGGER.debug("Evaluate result loaded: {}".format(eval_data))
     return eval_data
示例#17
0
def get_cross_size(gpid, guid, g_table, hpid, huid, h_table):
    """Run the host and guest role jobs in parallel and return the intersection size.

    Both ``role_jobs`` calls share one job id and run in a two-worker process
    pool; an exception raised by either job is printed and otherwise ignored.
    Afterwards the guest intersect-output table for that job is opened and its
    row count returned.

    :return: row count of the guest intersect output table
    """
    job_id = gen_job_id(hpid, gpid)
    roles = {consts.HOST: [hpid], consts.GUEST: [gpid]}
    host_args = (huid, consts.HOST, job_id, h_table, roles)
    guest_args = (guid, consts.GUEST, job_id, g_table, roles)

    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
        pending = [executor.submit(role_jobs, *job_args) for job_args in (host_args, guest_args)]
        for finished in concurrent.futures.as_completed(pending):
            try:
                finished.result()
            except Exception as exc:
                print(str(exc))

    intersect_table = consts.GUEST + '_intersect_output_' + job_id
    intersect_namespace = 'fdp_output_namespace'
    eggroll.init(job_id, WORK_MODE)
    output = eggroll.table(name=intersect_table, namespace=intersect_namespace)
    return output.count()
def get_commit_tmp_table(data_table_namespace):
    """Open (creating if missing) the ``version_tmp`` table named after a data-table namespace.

    :param data_table_namespace: used as the *name* of the table inside the
        ``version_tmp`` namespace
    :return: the single-partition eggroll table
    """
    options = dict(partition=1, create_if_missing=True, error_if_exist=False)
    return eggroll.table(name=data_table_namespace, namespace="version_tmp", **options)
def get_id_library_table_name():
    """Return the value stored under ``use_data_id`` in the ``id_library`` info table."""
    info_table = eggroll.table('info', 'id_library',
                               partition=10, create_if_missing=True, error_if_exist=False)
    active_id = info_table.get("use_data_id")
    return active_id
示例#20
0
def show_local_samples(name, namespace, topk=5):
    """Print the first and the last ``topk`` rows of a persistent sample table.

    :param name: table name
    :param namespace: table namespace
    :param topk: number of rows taken from each end of the collected list
    """
    samples = list(eggroll.table(name, namespace, persistent=True).collect())
    template = "sample_id: {}, training pairs:{}"

    # Head rows first, then tail rows, using the same slices as two separate loops.
    for row in samples[:topk] + samples[-topk:]:
        print(template.format(row[0], row[1]))
 def load_model(self, model_table, model_namespace):
     """Restore model attributes from the first record of the given model table.

     The value of the first collected ``(key, value)`` pair is taken as the
     model meta object, and its ``task_type``, ``loss_type``, ``tree_dim`` and
     ``trees_`` attributes are copied onto ``self``.
     """
     LOGGER.info("load model")
     records = list(eggroll.table(model_table, model_namespace).collect())
     modelmeta = records[0][1]
     for attr in ("task_type", "loss_type", "tree_dim", "trees_"):
         setattr(self, attr, getattr(modelmeta, attr))
示例#22
0
def save_data_to_eggroll_table(data, namespace, table_name, partition=1):
    """Write ``data`` into a table opened with ``error_if_exist=True`` and return it.

    :param data: key/value pairs handed to ``put_all``
    :param namespace: namespace of the target table
    :param table_name: name of the target table
    :param partition: partition count used when opening the table
    :return: the table the data was written to
    """
    options = dict(partition=partition, create_if_missing=True, error_if_exist=True)
    target = table(table_name, namespace, **options)
    target.put_all(data)
    return target
示例#23
0
def predict(gid, g_table, gy_id, hid, h_table, hy_id, model_name):
    """Run a predict job and return the guest's predictions as a DataFrame.

    Each collected record is expected to be ``(id, (label, prob, extra))``;
    the third element of the value is dropped.

    :return: ``pandas.DataFrame`` with columns ``id``, ``label``, ``prob``
    """
    job_id = gen_job_id(hid, gid)
    run_jobs(gid, g_table, gy_id, hid, h_table, hy_id, job_id, 'predict', model_name)
    eggroll.init(job_id, WORK_MODE)

    output_name = consts.GUEST + '_predict_table_' + job_id
    rows = list(eggroll.table(output_name, PREDICT_NAMESPACE).collect())

    records = []
    for sample_id, (label, prob, _extra) in rows:
        records.append((sample_id, label, prob))
    return pd.DataFrame.from_records(records, columns=['id', 'label', 'prob'])
示例#24
0
def get_data_table(name, namespace):
    """
    Look up a data table by name and namespace without creating it
    (``create_if_missing=False``).
    :param name: table name of data table
    :param namespace: table name space of data table
    :return:
        whatever ``eggroll.table`` returns for that lookup
    """
    lookup = dict(name=name, namespace=namespace, create_if_missing=False)
    return eggroll.table(**lookup)
示例#25
0
def get_lr_y_table(file_path):
    """Load labels from a CSV file into an eggroll table wrapped in ``TensorInEgg``.

    Column 0 of each row is used as the key and column 1 as the raw label,
    which is mapped to ``1`` when it equals 1 and to ``-1`` otherwise. The
    file is parsed once and validated before any table is created, so a
    rejected file leaves no table behind.

    :param file_path: path of the CSV file; it must have a column named ``y``
    :return: ``TensorInEgg`` over the label table
    :raises RuntimeError: if the CSV has no ``y`` column
    """
    csv_table = pd.read_csv(file_path)
    if 'y' not in csv_table.columns:
        raise RuntimeError("input data must contain y column")
    data = csv_table.values

    ns = str(uuid.uuid1())
    federation_info = RuntimeInstance.FEDERATION
    table_name = 'fata_script_test_data_y_' + str(federation_info.role) + str(federation_info.job_id)
    y = eggroll.table(table_name, ns, partition=2, persistent=True)
    for row in data:
        y.put(row[0], 1 if row[1] == 1 else -1)
    return TensorInEgg(federation_info.encrypt_operator, None, y)
示例#26
0
def show_result(table, namespace, rows=10):
    """Print the row count of a table followed by the features of its first rows.

    At most ``rows`` records are printed; a table with fewer records prints
    all of them instead of raising ``StopIteration`` or printing nothing.

    :param table: table name
    :param namespace: table namespace
    :param rows: maximum number of records to print
    """
    result = eggroll.table(table, namespace)
    print('data count: {}'.format(result.count()))

    # zip() with range(rows) first stops after `rows` records without pulling
    # an extra one from the collect() iterator, and stops early on short tables.
    for _, record in zip(range(rows), result.collect()):
        print("predict result: {}".format(record[1].features))
示例#27
0
def get_lr_x_table(file_path):
    """Load features from a CSV file into an eggroll table wrapped in ``TensorInEgg``.

    Column 0 of each row is used as the key. When the CSV has a column named
    ``y``, features start at column 2; otherwise they start at column 1. The
    file is parsed once.

    :param file_path: path of the CSV file
    :return: ``TensorInEgg`` over the feature table
    """
    csv_table = pd.read_csv(file_path)
    data = csv_table.values
    # Skip the key column, and the label column too when one is present.
    data_index = 2 if 'y' in csv_table.columns else 1

    ns = str(uuid.uuid1())
    federation_info = RuntimeInstance.FEDERATION
    # Convert role and job id to str separately so a non-str role cannot break
    # the concatenation (same naming scheme as the y table).
    table_name = 'fata_script_test_data_x_' + str(federation_info.role) + str(federation_info.job_id)
    x = eggroll.table(table_name, ns, partition=2, persistent=True)
    for row in data:
        x.put(row[0], row[data_index:])
    return TensorInEgg(federation_info.encrypt_operator, None, x)
示例#28
0
def read_model(buffer_type, proto_buffer, name, namespace):
    """Load a serialized protobuf model from a table into ``proto_buffer``.

    :param buffer_type: key under which the serialized buffer is stored
    :param proto_buffer: object exposing ``ParseFromString``; filled in place
    :param name: table name of the model table
    :param namespace: namespace of the model table
    :return: ``True`` if the table handle was truthy and the buffer was parsed,
        ``False`` otherwise
    """
    partition_count = get_model_table_partition_count()
    model_table = eggroll.table(name=name,
                                namespace=namespace,
                                partition=partition_count,
                                create_if_missing=False,
                                error_if_exist=False)
    if not model_table:
        return False

    proto_buffer.ParseFromString(model_table.get(buffer_type, use_serialize=False))
    return True
示例#29
0
    def read_data(self, table_name, namespace):
        """Open the input table and map every value through ``self.to_instance``.

        The reader settings stored on ``self`` are bundled into one list and
        bound as the first argument of ``to_instance`` before mapping.

        :return: the table produced by ``mapValues``
        """
        raw_table = eggroll.table(table_name, namespace)
        LOGGER.info("start to read data and change data to instance")

        setting_names = ("delimitor", "data_type", "missing_fill",
                         "default_value", "with_label", "label_idx",
                         "label_type", "output_format")
        params = [getattr(self, setting) for setting in setting_names]

        converter = functools.partial(self.to_instance, params)
        return raw_table.mapValues(converter)
    def load_model(self, model_table, model_namespace):
        """Restore model attributes from the first record of the given model table.

        The value of the first collected ``(key, value)`` pair is taken as the
        model meta object; its fields are copied onto ``self`` and the loss is
        re-initialised from the restored ``loss_type``.
        """
        LOGGER.info("load model")
        # collect() yields (key, value) pairs; only the first value is used.
        modelmeta = list(
            eggroll.table(model_table, model_namespace).collect())[0][1]
        self.task_type = modelmeta.task_type
        self.loss_type = modelmeta.loss_type
        self.tree_dim = modelmeta.tree_dim
        self.num_classes = modelmeta.num_classes
        self.trees_ = modelmeta.trees_
        self.classes_ = modelmeta.classes_
        # Note the name change: the meta object's `loss` is stored as `history_loss`.
        self.history_loss = modelmeta.loss

        self.set_loss(self.loss_type)