예제 #1
0
    def __init__(self):
        """Set up multiprocessing, config-driven options and data holders,
        then preload the raw user/product data.
        """

        # 'spawn': child processes start with a fresh interpreter
        mp.set_start_method('spawn')
        self._top_N = cf().path["inference"]["top_N"]
        self._using_gpu = cf().path["inference"]["using_gpu"]
        self._device = torch.device(cf().path["system"]["device"])

        # counters/containers populated later (presumably by the
        # load_*_raw_data() calls below — confirm)
        self._client_len = 0
        self._sku = 0
        self._user_data = OrderedDict()
        self._product_data = OrderedDict()

        self.user_col_name = OrderedDict()
        self.whole_user_col_name = OrderedDict()

        self.product_col_name = OrderedDict()
        self.whole_product_col_name = OrderedDict()

        # set process count: ~60% of cores for inference, ~20% for sampling
        self._num_processes = max(1, int(mp.cpu_count() * 0.6))
        self._num_sampler_processes = max(1, int(mp.cpu_count() * 0.2))

        # NOTE(review): this clobbers the config value read above — GPU use
        # is effectively forced off; confirm this is intended.
        self._using_gpu = False
        # shared list the sampler processes use to signal completion
        self._sampler_flag = mp.Manager().list()

        self.load_user_raw_data()
        self.load_product_raw_data()
예제 #2
0
    def get_dataloader(self):
        """Build the training DataLoader over this dataset.

        Returns:
            torch.utils.data.DataLoader configured from the "data" section
            of the config (batch_size, shuffle, num_workers), dropping the
            last incomplete batch.
        """
        # NOTE(review): the original body also built a second single-item
        # loader, printed the first batch's label and then called exit(),
        # so this function terminated the process instead of returning the
        # dataloader.  That leftover debug code has been removed.
        return data.DataLoader(
            self,
            batch_size=cf().path["data"]["batch_size"],
            shuffle=cf().path["data"]["shuffle"],
            num_workers=cf().path["data"]["num_workers"],
            drop_last=True)
예제 #3
0
    def sampler(self, index, num_process, raw_data_queue):
        """Feed inference-input dicts for a slice of clients into a queue.

        Builds inference data for each client via ``load_inf_data`` and puts
        it on ``raw_data_queue``.  With multiple sampler processes each
        sampler handles an equal slice of clients; the LAST sampler also
        takes the division remainder, waits until every sibling has flagged
        completion, then enqueues one "feeding_end" sentinel per inference
        process.  With a single sampler it feeds all clients and then the
        sentinels itself.

        :param index: this sampler's 0-based index
        :param num_process: number of inference (consumer) processes
        :param raw_data_queue: multiprocessing queue consumed by inference
        :return: None
        """
        import time  # local import: only needed for the completion poll

        offset = int(self._client_len / self._num_sampler_processes)
        start = index * offset

        if self._num_sampler_processes != 1:
            try:
                if index < self._num_sampler_processes - 1:
                    # Not the last sampler: feed own slice, then flag done.
                    for i in range(start, start + offset):
                        raw_data_queue.put(self.load_inf_data(i), block=True)
                    # job end
                    self._sampler_flag.append([1])

                else:
                    # Last sampler: feed the remainder of the client range.
                    for i in range(start, self._client_len):
                        raw_data_queue.put(self.load_inf_data(i), block=True)

                    # Wait for all sibling samplers before enqueueing one
                    # end-sentinel per consumer.  Sleep between polls
                    # instead of busy-spinning a full core (the original
                    # looped without yielding the CPU).
                    while len(self._sampler_flag) != (
                            self._num_sampler_processes - 1):
                        time.sleep(0.05)
                    for _ in range(num_process):
                        raw_data_queue.put(
                            cf().path["inference"]["feeding_end"],
                            block=True)

            except Exception as e:
                print("sampler", e)
                sys.exit(1)

        else:
            try:
                # Single-sampler case: feed everything, then the sentinels.
                # (local renamed from `data` so it no longer shadows the
                # torch.utils.data module alias used elsewhere)
                for i in range(self._client_len):
                    item = self.load_inf_data(i)
                    raw_data_queue.put(item, block=True)

                for _ in range(num_process):
                    raw_data_queue.put(cf().path["inference"]["feeding_end"],
                                       block=True)

            except Exception as e:
                print("p_sampler", e)
                sys.exit(1)
예제 #4
0
    def load_model(self):
        """Restore saved model weights (non-strict) and switch to eval mode.

        If no checkpoint exists at the configured path this is a no-op.
        Load errors are reported to stdout rather than raised.
        """
        try:
            model_path = cf().path["system"]["model_save_path"]
            if not os.path.exists(model_path):
                return
            # strict=False: only keys present in both the checkpoint and
            # the current model are restored.
            state = torch.load(model_path)
            self.model.load_state_dict(state, strict=False)
            self.model.eval()

        except Exception as e:
            print("load_model", e)
예제 #5
0
    def save_model(self):
        """Persist the current model weights to the configured checkpoint.

        Uses pickle protocol 4 (large-object support).  On failure the
        error is printed and the process exits.
        """
        try:
            state = self.model.state_dict()
            target = cf().path["system"]["model_save_path"]
            torch.save(state, target, pickle_protocol=4)

        except Exception as e:
            print("save_model", e)
            sys.exit()
예제 #6
0
    def ln_top_generator(self):
        """Derive the top-MLP layer sizes automatically.

        The first layer width is the bottom-MLP output size plus the
        pairwise-interaction count; the next layers halve it twice and the
        final layer has a single output unit.

        :return: np.ndarray of layer widths for the top MLP
        """
        bot_out = cf().path["model_parameter"]["ln_bot_output_layer"]
        sparse_len = self.dataset.sparse_col_len
        emb_dim = cf().path["model_parameter"]["m_spa"]

        num_features = int((sparse_len * emb_dim + bot_out) / bot_out)
        # pairwise interactions: 1 + 2 + ... + (num_features - 1)
        width = bot_out + sum(range(1, num_features))

        return np.array([width, int(width / 2), int(width / 4), 1])
예제 #7
0
    def load_my_state_dict(self):
        """Partially restore saved weights, mimicking transfer learning.

        Copies from the checkpoint only those tensors whose name AND shape
        match the current model, loads the merged state non-strictly, and
        puts the model in eval mode.  Missing checkpoint => no-op; errors
        are printed, not raised.
        """
        try:
            current_state = self.model.state_dict()
            model_path = cf().path["system"]["model_save_path"]

            if os.path.exists(model_path):
                saved_state = torch.load(model_path)
                print("loading saved model states...")
                for name, saved_param in saved_state.items():
                    current_param = current_state.get(name)
                    if current_param is None:
                        # checkpoint tensor has no counterpart here
                        continue
                    if current_param.shape == saved_param.shape:
                        current_param.copy_(saved_param)

                self.model.load_state_dict(current_state, strict=False)
                self.model.eval()

        except Exception as e:
            print("load_my_state_dict", e)
예제 #8
0
    def ln_bot_generator(self):
        """Derive the bottom-MLP layer sizes from the dense feature layout.

        The input width is the total number of dense columns (single + seq)
        across all feature groups; the output width comes from config.

        :return: np.ndarray [dense_input_width, ln_bot_output_layer]
        """
        dense_width = 0
        for group in self.dataset.col_name.values():
            for cols in group["dense"].values():
                # skip empty column arrays
                if len(cols) >= 1:
                    dense_width += cols.shape[0]

        # output layer width = 2 (from config)
        out_width = cf().path["model_parameter"]["ln_bot_output_layer"]
        return np.array([dense_width, out_width])
예제 #9
0
    def load_inf_data(self, client_index):
        """Build one client's inference batch, split into dense and sparse.

        User feature values are broadcast (np.full) across ``self._sku``
        rows so every row pairs this client with one product; product
        features are taken column-wise from the preloaded product table.
        Sparse columns are converted to LongTensor for embedding lookup.

        :param client_index: row index of the client in the user data
        :return: dict with keys 'dense', 'sparse', 'client_index'
                 (implicitly None if the except branch below fires)
        """

        try:
            dense = dict()

            sparse = dict()

            result = dict()

            # user data
            for f_l in self.whole_user_col_name.keys():
                for s_l in self.whole_user_col_name[f_l]:
                    for col_order, column_name in enumerate(
                            self.whole_user_col_name[f_l][s_l]):

                        if s_l == "seq":
                            """
                            obtain inference data of shape
                            (batch size, sequence length)
                            """
                            if f_l == "dense":
                                dense[column_name] = np.full(
                                    (self._sku, cf().path["data"]["SEQ_LEN"]),
                                    np.array(self._user_data[f_l][s_l]
                                             [client_index][col_order]))
                            else:
                                # print(column_name)
                                # TODO: discuss later how 'offset' should
                                # be managed
                                if column_name == 'offset':
                                    tmp = np.full(
                                        (self._sku),
                                        np.array(self._user_data[f_l][s_l]
                                                 [client_index][col_order]))

                                    # must be LongTensor, otherwise
                                    # Embedding_bag complains loudly
                                    tmp = torch.LongTensor(tmp)
                                    sparse[column_name] = tmp

                                else:

                                    tmp = np.full(
                                        (self._sku,
                                         cf().path["data"]["SEQ_LEN"]),
                                        np.array(self._user_data[f_l][s_l]
                                                 [client_index][col_order]))

                                    tmp = torch.LongTensor(tmp)
                                    sparse[column_name] = tmp
                        else:
                            # single-valued user column: broadcast the
                            # scalar across all SKUs
                            if f_l == "dense":
                                dense[column_name] = np.full(
                                    (self._sku),
                                    np.array(self._user_data[f_l][s_l]
                                             [client_index][col_order]))

                            else:
                                tmp = np.full(
                                    (self._sku),
                                    np.array(self._user_data[f_l][s_l]
                                             [client_index][col_order]))
                                tmp = torch.LongTensor(tmp)
                                sparse[column_name] = tmp

            # product: transpose to column-major so tmp[col_order] selects
            # one feature column across all SKUs
            for f_l in self.whole_product_col_name.keys():
                for s_l in self.whole_product_col_name[f_l]:
                    for col_order, column_name in enumerate(
                            self.whole_product_col_name[f_l][s_l]):

                        tmp = np.transpose(self._product_data[f_l][s_l])

                        if f_l == "dense":
                            dense[column_name] = tmp[col_order]
                        else:
                            tmp = tmp[col_order]
                            tmp = torch.LongTensor(tmp)
                            sparse[column_name] = tmp  #tmp[col_order]

            result['dense'] = dense
            result['sparse'] = sparse
            result['client_index'] = client_index

            return result

        except Exception as e:
            print("load rawdata", e)
예제 #10
0
    def inference(self, model, raw_data_queue, result_queue, top_N):
        """Consume inference inputs from a queue and emit top-N item rows.

        Runs until a "feeding_end" sentinel arrives (so not every queue
        item is a dict).  For each input dict it scores every SKU for the
        client, ranks by score, and pushes
        ``[incs_no, prd_nm #1, ..., prd_nm #top_N]`` to ``result_queue``.
        On the sentinel it pushes its own pid and returns.

        :param model: inference model (provides dlrm_wrap)
        :param raw_data_queue: queue of input dicts fed by the samplers
        :param result_queue: queue of result rows to be written to csv
        :param top_N: number of items to return per client
        :return: None
        """

        proc = os.getpid()
        start_vect = time.time()

        # Load the static lookup tables ONCE per worker.  The original
        # re-opened and re-parsed client_info.json for every message and
        # item_info.json for EVERY recommended item, which dominated the
        # serving time.  Both files are assumed immutable while serving —
        # confirm if they can change mid-run.
        with open('config/client_info.json') as json_file:
            client_info = json.load(json_file)
        with open('config/item_info.json') as json_file:
            item_info = json.load(json_file)

        while True:
            try:

                data = raw_data_queue.get(block=True)

                if data == cf().path["inference"]["feeding_end"]:
                    # sentinel: report this worker's pid and stop
                    result_queue.put(proc, block=True)
                    break

                else:
                    # map client index -> incs_no via the lookup table
                    client_index = data['client_index']
                    incs_no = client_info[str(client_index)]['incs_no']

                    dense_data = data['dense']
                    sparse_data = data['sparse']

                    dense_x = self.gen_inference_dense_factor(dense_data)

                    lS_i, lS_o = self.gen_inference_sparse_factor(sparse_data)

                    unsorted_score = model.dlrm_wrap(
                        dense_x, lS_o, lS_i, self._using_gpu,
                        cf().path["system"]["device"]).detach().cpu().numpy()

                    # rank (item_index, score) pairs by score, best first
                    ranked = sorted(enumerate(unsorted_score),
                                    key=operator.itemgetter(1),
                                    reverse=True)

                    # [incs_no, prd_nm #1, ..., prd_nm #top_N]
                    items = [incs_no]
                    for item_index, _score in ranked[:top_N]:
                        items.append(item_info[str(item_index)]['prd_nm'])

                    result_queue.put(items, block=True)

            except Exception as e:
                print("inference error ", e)

        print(
            f"{proc}'s serving Runtime: {(time.time() - start_vect) / 60} Minutes"
        )
예제 #11
0
    def __init__(self, device):
        """Assemble the DLRM model from config and the training dataset.

        :param device: torch device — NOTE(review): not used anywhere in
            this constructor; confirm whether it should be stored/applied.
        """

        self.learning_rate = cf().path["model_parameter"]["learning_rate"]

        # tensorboard: embedding-layer column names collected for logging
        self.emb_l_colName = list()

        self.dataset = TrainData()
        self.dataloader = self.dataset.get_dataloader()

        print(colored("generating DLRM frame", "yellow"))
        # layer-size specs derived from the dataset layout and the config
        self.ln_emb = self.ln_emb_generator()
        self.ln_bot = self.ln_bot_generator()
        self.ln_top = self.ln_top_generator()

        self.model = DLRM_Net(
            cf().path["model_parameter"]["m_spa"],
            self.ln_emb,
            self.ln_bot,
            self.ln_top,
            arch_interaction_op=cf().path["model_parameter"]
            ["arch_interaction_op"],
            arch_interaction_itself=cf().path["model_parameter"]
            ["arch_interaction_itself"],
            sigmoid_bot=cf().path["model_parameter"]["sigmoid_bot"],
            # sigmoid applied at the second-to-last top-layer index
            sigmoid_top=self.ln_top.size - 2,
            sync_dense_params=cf().path["model_parameter"]
            ["sync_dense_params"],
            loss_threshold=cf().path["model_parameter"]["loss_threshold"],
            ndevices=cf().path["model_parameter"]["ndevices"],
            qr_flag=cf().path["model_parameter"]["qr_flag"],
            qr_operation=cf().path["model_parameter"]["qr_operation"],
            qr_collisions=cf().path["model_parameter"]["qr_collisions"],
            qr_threshold=cf().path["model_parameter"]["qr_threshold"],
            md_flag=cf().path["model_parameter"]["md_flag"],
            md_threshold=cf().path["model_parameter"]["md_threshold"])
예제 #12
0
    def gen_sparse_factor(data):
        """Build the sparse index/offset lists for embedding-bag lookup.

        Order is user then product, and within each: single columns first,
        then seq columns.

        NOTE(review): declared without ``self`` at method indentation and
        it reads the module-level ``recsys`` — confirm how this function is
        actually bound and called.

        :param data: batch dict of sparse tensors keyed by column name
        :return: (lS_i, lS_o) — per-column index tensors and a LongTensor
                 of per-column offsets
        """

        lS_i = list()
        user_lS_o = list()
        prod_lS_o = list()

        seq_cnt = 0
        single_cnt = 0

        user_cols = recsys.dataset.col_name['user']['sparse']
        try:
            import itertools as it
            batch_size = cf().path["data"]["batch_size"]

            # flatten the offset tensor into a mutable 1-D array
            seq_offset = np.array(data["offset"].view(-1))

            # print("seq_offset", seq_offset)
            # user
            user_single_data = list()
            user_seq_data = list()

            for key in user_cols.keys():

                if key == "single":

                    for column_name in user_cols[key]:
                        single_cnt += 1
                        #user_single_data.append(data[column_name])
                        user_single_data.append(data[column_name])

                elif key == "seq":
                    """
                    append one batch worth of elements per seq column
                    """

                    for column_name in user_cols[key]:
                        seq_cnt += 1

                        seq_items = list()

                        for i in range(batch_size):
                            # keep only non-zero (non-padded) entries
                            temp = data[column_name][i]
                            temp = temp[temp.nonzero().squeeze().detach()]
                            temp = temp.view(-1)

                            seq_items.append(temp)

                        seq_items = torch.cat(seq_items)

                        user_seq_data.append(seq_items)

            lS_i = user_single_data + user_seq_data

            # build offsets; the last sequence length is not needed
            seq_offset = list(it.accumulate(seq_offset[:-1]))
            # offset starts with zero
            seq_offset.insert(0, 0)
            # print("seq_offset", seq_offset)

            # single columns: one index per batch row
            for i in range(single_cnt):
                tmp = [i for i in range(batch_size)]
                user_lS_o.append(tmp)

            # seq columns: the cumulative offsets computed above
            for i in range(seq_cnt):
                user_lS_o.append(seq_offset)

        except Exception as e:
            print("user gen_sparse_factor", e)

        # product
        seq_cnt = 0
        single_cnt = 0
        prod_cols = recsys.dataset.col_name['product']['sparse']

        try:
            import itertools as it
            batch_size = cf().path["data"]["batch_size"]

            # flatten the offset tensor into a mutable 1-D array
            #seq_offset = np.array(data["offset"].view(-1))

            # Padded at Head
            seq_offset = np.array(data["offset"].view(-1))
            #

            # product
            prod_single_data = list()
            prod_seq_data = list()

            for key in prod_cols.keys():

                if key == "single":

                    for column_name in prod_cols[key]:
                        single_cnt += 1
                        #print(column_name)
                        # prod_single_data.append(data[column_name])
                        prod_single_data.append(data[column_name])

                elif key == "seq":

                    for column_name in prod_cols[key]:

                        seq_cnt += 1
                        seq_items = list()

                        for i in range(batch_size):
                            # original
                            # seq_items.append(data[column_name][i])
                            temp = data[column_name][i]
                            temp = temp[temp.nonzero().squeeze().detach()]
                            temp = temp.view(-1)

                            seq_items.append(temp)

                        seq_items = torch.cat(seq_items)
                        prod_seq_data.append(seq_items)

            # product ls_i
            prd_ls_i = prod_single_data + prod_seq_data

            lS_i += prd_ls_i

            #print(seq_offset)
            # build offsets; the last sequence length is not needed
            # seq_offset = list(it.accumulate(seq_offset[:-1]))
            # Padded at Head
            seq_offset = list(it.accumulate(seq_offset[:-1]))
            # print(seq_offset)
            #

            # offset starts with zero
            seq_offset.insert(0, 0)

            for i in range(single_cnt):
                tmp = [i for i in range(batch_size)]
                prod_lS_o.append(tmp)

            for i in range(seq_cnt):
                prod_lS_o.append(seq_offset)

            lS_o = user_lS_o + prod_lS_o

            lS_o = torch.LongTensor(lS_o)

            return lS_i, lS_o

        except Exception as e:
            print("")
            print("prod gen_sparse_factor", e)
            print("data : ", data)
            print("prod_single_data : ", prod_single_data)
            print("prod_seq_data : ", prod_seq_data)
예제 #13
0
def train():
    def gen_dense_factor(data):
        """
        dense data 중 시퀀스인 애들은 avg해서 쓴다.
        single val -> 그대로 사용

        :param data: input train data, type = dict
        :return:
        """

        try:
            items = list()

            for key in data.keys():

                tmp = np.array(data[key])

                # list type -> avg
                if len(tmp.shape) > 1:
                    tmp += 1

                    seq_avg = np.true_divide(tmp.sum(1), (tmp != 0).sum(1))
                    items.append(seq_avg)

                    # tmp = np.mean(tmp, axis=1)
                    # items.append(tmp)
                else:
                    items.append(tmp)

            items = np.array(items)
            items = items.transpose()

            result = torch.Tensor(items)

            return result
        except Exception as e:
            print("gen_dense_factor", e)

    def gen_sparse_factor(data):
        """
        순서는 user - product, single - seq each
        :param data:
        :return:
        """

        lS_i = list()
        user_lS_o = list()
        prod_lS_o = list()

        seq_cnt = 0
        single_cnt = 0

        user_cols = recsys.dataset.col_name['user']['sparse']
        try:
            import itertools as it
            batch_size = cf().path["data"]["batch_size"]

            # offset list를 변형 가능한 꼴로 변환시켜
            seq_offset = np.array(data["offset"].view(-1))

            # print("seq_offset", seq_offset)
            # user
            user_single_data = list()
            user_seq_data = list()

            for key in user_cols.keys():

                if key == "single":

                    for column_name in user_cols[key]:
                        single_cnt += 1
                        #user_single_data.append(data[column_name])
                        user_single_data.append(data[column_name])

                elif key == "seq":
                    """
                    각 컬럼에 배치 사이즈만큼의 길이씩 원소를 추가해간다. 
                    """

                    for column_name in user_cols[key]:
                        seq_cnt += 1

                        seq_items = list()

                        for i in range(batch_size):
                            temp = data[column_name][i]
                            temp = temp[temp.nonzero().squeeze().detach()]
                            temp = temp.view(-1)

                            seq_items.append(temp)

                        seq_items = torch.cat(seq_items)

                        user_seq_data.append(seq_items)

            lS_i = user_single_data + user_seq_data

            # offset 설정, 마지막 시퀀스 길이는 알필요없음
            seq_offset = list(it.accumulate(seq_offset[:-1]))
            # offset starts with zero
            seq_offset.insert(0, 0)
            # print("seq_offset", seq_offset)

            for i in range(single_cnt):
                tmp = [i for i in range(batch_size)]
                user_lS_o.append(tmp)

            for i in range(seq_cnt):
                user_lS_o.append(seq_offset)

        except Exception as e:
            print("user gen_sparse_factor", e)

        # product
        seq_cnt = 0
        single_cnt = 0
        prod_cols = recsys.dataset.col_name['product']['sparse']

        try:
            import itertools as it
            batch_size = cf().path["data"]["batch_size"]

            # offset list를 변형 가능한 꼴로 변환시켜
            #seq_offset = np.array(data["offset"].view(-1))

            # Padded at Head
            seq_offset = np.array(data["offset"].view(-1))
            #

            # product
            prod_single_data = list()
            prod_seq_data = list()

            for key in prod_cols.keys():

                if key == "single":

                    for column_name in prod_cols[key]:
                        single_cnt += 1
                        #print(column_name)
                        # prod_single_data.append(data[column_name])
                        prod_single_data.append(data[column_name])

                elif key == "seq":

                    for column_name in prod_cols[key]:

                        seq_cnt += 1
                        seq_items = list()

                        for i in range(batch_size):
                            # original
                            # seq_items.append(data[column_name][i])
                            temp = data[column_name][i]
                            temp = temp[temp.nonzero().squeeze().detach()]
                            temp = temp.view(-1)

                            seq_items.append(temp)

                        seq_items = torch.cat(seq_items)
                        prod_seq_data.append(seq_items)

            # product ls_i
            prd_ls_i = prod_single_data + prod_seq_data

            lS_i += prd_ls_i

            #print(seq_offset)
            # offset 설정, 마지막 시퀀스 길이는 알필요없음
            # seq_offset = list(it.accumulate(seq_offset[:-1]))
            # Padded at Head
            seq_offset = list(it.accumulate(seq_offset[:-1]))
            # print(seq_offset)
            #

            # offset starts with zero
            seq_offset.insert(0, 0)

            for i in range(single_cnt):
                tmp = [i for i in range(batch_size)]
                prod_lS_o.append(tmp)

            for i in range(seq_cnt):
                prod_lS_o.append(seq_offset)

            lS_o = user_lS_o + prod_lS_o

            lS_o = torch.LongTensor(lS_o)

            return lS_i, lS_o

        except Exception as e:
            print("")
            print("prod gen_sparse_factor", e)
            print("data : ", data)
            print("prod_single_data : ", prod_single_data)
            print("prod_seq_data : ", prod_seq_data)

    def loss_fn_wrap(Z, T, use_gpu, device):
        if use_gpu:
            return loss_fn(Z, T.to(device))
        else:
            return loss_fn(Z, T)

    using_gpu = False

    # gpu 장비를 사용하는지에 따라 하드웨어 속성변경
    # if torch.cuda.is_available():
    #     # torch.cuda.manual_seed_all(args.numpy_rand_seed)
    #     # torch.backends.cudnn.deterministic = True
    #     device = torch.device("cuda", 7)
    #     using_gpu = True
    # else:

    device = torch.device("cpu")
    recsys = Recsys(device)
    # recsys.load_my_state_dict()

    writer = SummaryWriter()

    #for param_tensor in recsys.model.state_dict():
    #    print("", colored(f"{param_tensor}", "blue", attrs=["bold"]), colored(f"{recsys.model.state_dict()[param_tensor].size()}", "blue", attrs=["bold"]))

    learning_rate = cf().path["model_parameter"]["learning_rate"]

    loss_fn = torch.nn.BCEWithLogitsLoss()

    optimizer = torch.optim.Adam(recsys.model.parameters(), lr=learning_rate)
    print(colored(f"DLRM frame generate done", 'yellow'), "\n")

    # 옵티마이저의 state_dict 출력
    # print("Optimizer's state_dict:")
    # for var_name in optimizer.state_dict():
    #     print(var_name, "\t", optimizer.state_dict()[var_name])

    total_iter = 0
    epochs = cf().path["data"]["epoch"]
    # epoch
    k = 0
    start_vect = time.time()

    best_model_wts = copy.deepcopy(recsys.model.state_dict())
    best_acc = 0.0

    print(colored("MODEL TRAINING START", "yellow", attrs=["underline"]), "\n")
    START_TIME = time.time()

    M = eval()

    M.add_emb(recsys, writer)

    recsys.model.train()  # 모델을 학습 모드로 설정

    with torch.autograd.profiler.profile(False, False) as prof:
        try:
            while k < epochs:
                total_iter = 0
                k += 1

                for it, (dense_data, sparse_data,
                         label) in enumerate(recsys.dataloader):

                    dense_x = gen_dense_factor(dense_data)
                    lS_i, lS_o = gen_sparse_factor(sparse_data)

                    Yhat = recsys.dlrm_wrap(dense_x, lS_o, lS_i, using_gpu,
                                            device)

                    Y = label.type(torch.FloatTensor)

                    E = loss_fn_wrap(Yhat, Y, using_gpu, device)

                    try:
                        optimizer.zero_grad()
                        # backward pass
                        E.backward()
                        # optimizer
                        optimizer.step()

                    except Exception as e:
                        print("weight update error", e)
                        sys.exit(1)

                    if (it % 50 == 0):
                        print(f"{k} epoch , iteration : {it}")
                        print(colored(f"Epoch : {k}", "blue"))
                        print("Yhat : ", Yhat)
                        print("Label : ", label)
                        print("Loss : ", E)
                        current_error = M.metrics(total_iter, E, Yhat, label,
                                                  writer)

                        if best_acc < current_error:
                            best_acc = current_error
                            best_model_wts = copy.deepcopy(
                                recsys.model.state_dict())

                    #recsys.model.load_state_dict(best_model_wts)
                    #recsys.save_model()

            print(
                colored(
                    f"TRAIN RUNTIME: {(time.time() - start_vect) / 60} Min",
                    "yellow",
                    attrs=["underline"]), "\n")

            #distributed_inference()
            M.close(writer)
            sys.exit()

        except Exception as e:
            print("train 도중", e)
예제 #14
0
import copy
import os
import sys
import time

import pandas as pd
import sklearn.metrics
import torch
from sklearn.metrics import auc
from termcolor import colored

from config.config import config as cf
from distrib_inf_lv import distributed_inference
from recsys.Recsys import Recsys

# Build the model frame on CPU, restore any shape-matching saved weights,
# and set up a loss/optimizer pair.
# (fix: the original used torch.device / torch.nn / torch.optim without
# ever importing torch — added to the import block above)
device = torch.device("cpu")
recsys = Recsys(device)
recsys.load_my_state_dict()
learning_rate = cf().path["model_parameter"]["learning_rate"]
loss_fn = torch.nn.MSELoss(reduction="mean")
optimizer = torch.optim.SGD(recsys.model.parameters(), lr=learning_rate)

total_iter = 0

k = 0
start_vect = time.time()

best_model_wts = copy.deepcopy(recsys.model.state_dict())
best_acc = 0.0

# Peek at the first batch only.
# NOTE(review): other call sites unpack three values (dense, sparse,
# label) from this dataloader — confirm the 2-tuple unpacking here
# matches what the dataset actually yields.
for data, label in recsys.dataloader:
    print(data, label)
    break
예제 #15
0
    def __init__(self):
        """Load user/product parquet data and declare the feature layout.

        Reads the raw parquet tables, attaches positive labels (plus
        negative samples), selects the model's input columns, and builds
        the nested feature dictionary:
        user/product -> dense/sparse -> single/seq -> column-name arrays.
        """
        super().__init__()

        self.train_data = None
        self.len = 0

        self._data = OrderedDict()
        self._label = dict()

        # all columns, including bookkeeping ones such as 'offset'
        self.whole_col_name = OrderedDict()

        # categorical columns that were label-encoded on disk
        self.encoded_list = [
            "cust_grd_nm", "dvce_tp_cd", "emp_yn", "prd_brnd_nm", "prd_cd",
            "prd_tp_cat_vl", "sex_cd"
        ]
        # TODO(review): data paths are hard-coded to a developer machine;
        # consider moving them into the config.
        self.encoding_dict = self.read_encode_dict(
            prefix="/Users/amore/ap-recsys-model/tb_recommend_raw",
            cols=self.encoded_list)

        self.items_dataset = self.read_parquets(
            "/Users/amore/ap-recsys-model/tb_recommend_raw/item_meta"
        ).set_index('prd_cd')
        # keep prd_cd both as the index and as a regular column
        self.items_dataset['prd_cd'] = self.items_dataset.index
        self.users_dataset = self.read_parquets(
            "/Users/amore/ap-recsys-model/tb_recommend_raw/user_meta")

        # every raw user row is a positive example ...
        self.users_dataset['label'] = 1
        # ... and negatives are appended from a separate dump
        self.negative_labels(
            path='/Users/amore/ap-recsys-model/tb_recommend_raw/neg_sample')
        self.total_dataset_length = len(self.users_dataset)

        self.dataset = self.users_dataset[[
            'age', 'dvce_tp_cd', 'sex_cd', 'emp_yn', 'cust_grd_nm', 'seq_cnt',
            'prd_cd', 'prd_brnd_nm', 'prd_norm_prc', 'prd_tp_cat_vl',
            'tg_prd_cd', 'tg_prd_brnd_nm', 'tg_prd_norm_prc',
            'tg_prd_tp_cat_vl', 'label'
        ]]

        self.first_layer = ['user', 'product']
        self.second_layer = ['dense', 'sparse']
        self.third_layer = ['single', 'seq']

        # Nested layout: owner -> density -> arity -> column-name array.
        # (fix: the original assigned each leaf OrderedDict twice — the
        # duplicate assignments were redundant and have been removed)
        self.feature_dict = OrderedDict()
        for owner in self.first_layer:
            self.feature_dict[owner] = OrderedDict()
            for density in self.second_layer:
                self.feature_dict[owner][density] = OrderedDict()

        self.feature_dict['user']['dense']['single'] = np.array(['age'])
        self.feature_dict['user']['dense']['seq'] = np.array([])
        self.feature_dict['user']['sparse']['single'] = np.array(
            ['dvce_tp_cd', 'sex_cd', 'emp_yn', 'cust_grd_nm'])
        self.feature_dict['user']['sparse']['seq'] = np.array([])
        self.feature_dict['product']['dense']['single'] = np.array(
            ['tg_prd_norm_prc'])
        self.feature_dict['product']['dense']['seq'] = np.array(
            ['prd_norm_prc'])
        self.feature_dict['product']['sparse']['single'] = np.array(
            ['tg_prd_cd', 'tg_prd_brnd_nm', 'tg_prd_tp_cat_vl'])
        self.feature_dict['product']['sparse']['seq'] = np.array(
            ['prd_cd', 'prd_brnd_nm', 'prd_tp_cat_vl'])
        # self.feature_dict['seq_cnt'] = np.array(['seq_cnt'])

        self.write_unique_file()
        # total number of sparse feature columns across user and product
        self.sparse_col_len = (
            len(self.feature_dict['user']['sparse']['single']) +
            len(self.feature_dict['user']['sparse']['seq']) +
            len(self.feature_dict['product']['sparse']['single']) +
            len(self.feature_dict['product']['sparse']['seq']))
        self.col_name = self.feature_dict

        self.batch_size = cf().path["data"]["batch_size"]
        self.shuffle = cf().path["data"]["shuffle"]
        self.num_workers = cf().path["data"]["num_workers"]
        self.drop_last = True