示例#1
0
class Trainer(object):
    """Drive hold-one-out training/validation rounds of a LambdaRank model."""

    def __init__(self, dataset):
        """
        dataset: object of DataInputer
        """
        self.dataset = dataset
        self.model = LambdaRank()

    def __call__(self):
        """Calling the trainer is shorthand for ``run()``."""
        self.run()

    def run(self):
        """
        Run TRAIN_SET_NUM rounds. In each round one dataset index is held
        out, the model is trained on the remaining indexes and then
        validated. The per-round results are averaged and the model's
        ``dataspace`` is written to a file named after that average.
        """
        # NOTE(review): self.model is created once in __init__ and is not
        # reset between rounds, so training accumulates across rounds --
        # confirm this is intended.
        val_maps = []
        for val_idx in range(TRAIN_SET_NUM):
            # use the val_idx-th dataset as the validation dataset
            self.val_idx = val_idx
            set_indexs = set(range(TRAIN_SET_NUM))
            set_indexs.discard(val_idx)

            self.train(set_indexs)
            val_res = self.validate()
            show_status(".. get map: " + str(val_res))
            val_maps.append(val_res)
        # float() keeps the average from being truncated by Python 2
        # integer division when the per-round results are ints.
        map_res = sum(val_maps) / float(TRAIN_SET_NUM)
        show_status(".. get avage map: " + str(map_res))
        self.model.dataspace.tofile(data_path('models', str(map_res)))

    def train(self, set_indexs):
        """
        Train the model on every dataset whose index is in ``set_indexs``.

        set_indexs: iterable of dataset indexes to train on
        """
        # Each loop level has its own name. Previously the line counter and
        # the list comprehensions all reused ``i``, clobbering the dataset
        # index that ``train_pairs[...]`` is looked up with.
        for set_idx in set_indexs:
            show_status(">>>" * 20)
            show_status(".. training %dth dataset" % set_idx)
            for dataset in self.dataset.get_dataset(set_idx):
                dataset_len = len(self.dataset.train_pairs[set_idx])
                show_status("dataset len: %d" % dataset_len)
                for line_no, (X1, X2) in enumerate(dataset):
                    print("train %dth line" % line_no)
                    show_status(">> training data", line_no, dataset_len)
                    # each side of the pair is a whitespace-separated
                    # string of numbers
                    X1 = np.array([float(tok) for tok in X1.split()])
                    X2 = np.array([float(tok) for tok in X2.split()])
                    self.model.study_line(X1, X2)

    def validate(self):
        """
        Score the held-out record ``self.dataset.trainset[self.val_idx]``
        and return ``cal_map(p_papers, predicts)``, where ``predicts`` is
        a list of (paper, score) tuples ordered by descending score.
        """
        # TODO how to validate?
        vali_set = self.dataset.trainset[self.val_idx]
        # the record must contain exactly three comma-separated fields,
        # otherwise the unpacking raises ValueError
        uid, p_papers, n_papers = vali_set.split(',')
        uid = int(uid)
        # NOTE(review): p_papers / n_papers are still strings here, so these
        # comprehensions convert them character by character. If the ids
        # are whitespace-separated multi-digit numbers this needs .split()
        # -- confirm the record format.
        p_papers = [int(ch) for ch in p_papers]
        n_papers = [int(ch) for ch in n_papers]
        predicts = []
        for p in p_papers + n_papers:
            p_feature = self.dataset.get_data_line(uid, p)
            score = self.model.predict(p_feature)
            predicts.append((p, score))
        # Highest score first. The sort is stable, so tied scores keep their
        # insertion order, as with the former cmp-style comparator.
        predicts.sort(key=lambda pair: pair[1], reverse=True)
        return cal_map(p_papers, predicts)
示例#2
0
 def __init__(self, dataset):
     """
     Set up the trainer.

     dataset: object of DataInputer, kept on the instance as-is
     """
     # a new LambdaRank model is built for every trainer instance
     self.model = LambdaRank()
     self.dataset = dataset
示例#3
0
class Trainer(object):
    """Drive hold-one-out training/validation rounds of a LambdaRank model."""

    def __init__(self, dataset):
        """
        dataset: object of DataInputer
        """
        self.dataset = dataset
        self.model = LambdaRank()

    def __call__(self):
        """Calling the trainer is shorthand for ``run()``."""
        self.run()

    def run(self):
        """
        Run TRAIN_SET_NUM rounds. In each round one dataset index is held
        out, the model is trained on the remaining indexes and then
        validated. The per-round results are averaged and the model's
        ``dataspace`` is written to a file named after that average.
        """
        # NOTE(review): self.model is created once in __init__ and is not
        # reset between rounds, so training accumulates across rounds --
        # confirm this is intended.
        val_maps = []
        for val_idx in range(TRAIN_SET_NUM):
            # use the val_idx-th dataset as the validation dataset
            self.val_idx = val_idx
            set_indexs = set(range(TRAIN_SET_NUM))
            set_indexs.discard(val_idx)

            self.train(set_indexs)
            val_res = self.validate()
            show_status(".. get map: " + str(val_res))
            val_maps.append(val_res)
        # float() keeps the average from being truncated by Python 2
        # integer division when the per-round results are ints.
        map_res = sum(val_maps) / float(TRAIN_SET_NUM)
        show_status(".. get avage map: " + str(map_res))
        self.model.dataspace.tofile(data_path('models', str(map_res)))

    def train(self, set_indexs):
        """
        Train the model on every dataset whose index is in ``set_indexs``.

        set_indexs: iterable of dataset indexes to train on
        """
        # Each loop level has its own name. Previously the line counter and
        # the list comprehensions all reused ``i``, clobbering the dataset
        # index that ``train_pairs[...]`` is looked up with.
        for set_idx in set_indexs:
            show_status(">>>" * 20)
            show_status(".. training %dth dataset" % set_idx)
            for dataset in self.dataset.get_dataset(set_idx):
                dataset_len = len(self.dataset.train_pairs[set_idx])
                show_status("dataset len: %d" % dataset_len)
                for line_no, (X1, X2) in enumerate(dataset):
                    print("train %dth line" % line_no)
                    show_status(">> training data", line_no, dataset_len)
                    # each side of the pair is a whitespace-separated
                    # string of numbers
                    X1 = np.array([float(tok) for tok in X1.split()])
                    X2 = np.array([float(tok) for tok in X2.split()])
                    self.model.study_line(X1, X2)

    def validate(self):
        """
        Score the held-out record ``self.dataset.trainset[self.val_idx]``
        and return ``cal_map(p_papers, predicts)``, where ``predicts`` is
        a list of (paper, score) tuples ordered by descending score.
        """
        # TODO how to validate?
        vali_set = self.dataset.trainset[self.val_idx]
        # the record must contain exactly three comma-separated fields,
        # otherwise the unpacking raises ValueError
        uid, p_papers, n_papers = vali_set.split(',')
        uid = int(uid)
        # NOTE(review): p_papers / n_papers are still strings here, so these
        # comprehensions convert them character by character. If the ids
        # are whitespace-separated multi-digit numbers this needs .split()
        # -- confirm the record format.
        p_papers = [int(ch) for ch in p_papers]
        n_papers = [int(ch) for ch in n_papers]
        predicts = []
        for p in p_papers + n_papers:
            p_feature = self.dataset.get_data_line(uid, p)
            score = self.model.predict(p_feature)
            predicts.append((p, score))
        # Highest score first. The sort is stable, so tied scores keep their
        # insertion order, as with the former cmp-style comparator.
        predicts.sort(key=lambda pair: pair[1], reverse=True)
        return cal_map(p_papers, predicts)