def __init__(self, filename=None):
    """
    Set up an untrained predictor.

    :param filename: if provided, cache parameters trained
                     (this constructor only stores the value)
    """
    self.filename = filename
    # Rating matrix and user/item collections start out unset.
    self.rmat = self.users = self.items = None
    # Logger is named after the concrete (sub)class of this instance.
    logger_name = self.__class__.__name__
    self.log = LogUtil.getLogger(logger_name)
    super(Predictor, self).__init__()
def get_data(filename, columns, delimiter='::'):
    """
    Load a delimited text file into a dataframe of string fields.

    Each line has its trailing whitespace stripped and is then split on
    ``delimiter``. Lines that are empty after stripping (for example a
    trailing blank line at the end of the file) are skipped, so they do
    not turn into a one-field row that mismatches ``columns``.

    :param filename: path of data source
    :param columns: column name for each column
    :param delimiter: delimiter to split a line
    :return: dataframe with one row per non-empty line
    """
    log = LogUtil.getLogger('get_data')
    clock = Timer()
    with open(filename, 'r') as infile:
        # Iterate the file object directly instead of readlines(), so the
        # raw lines are not all held in memory next to the parsed rows.
        stripped = (line.rstrip() for line in infile)
        rows = [line.split(delimiter) for line in stripped if line]
    df = pd.DataFrame(rows, columns=columns)
    e0 = clock.restart()
    log.info("loading data from %s with columns %s takes %.3f secs ", filename, columns, e0)
    return df
def train_test_split(ratings, frac=0.1, group='user', seed=1):
    """
    Split ratings into a train part and a test part.

    If ``group`` is provided, a fraction ``frac`` of the rows is sampled
    inside each group to form the test set; otherwise the fraction is
    sampled from the whole dataframe. The train set is made of the rows
    of ``ratings`` that an outer merge with the test set marks as
    ``left_only``.

    :param ratings: dataframe to split
    :param frac: fraction of rows that goes to the test set
    :param group: column(s) to group by before sampling; falsy to disable
    :param seed: random state passed to every ``sample`` call
    :return: tuple ``(ratings_train, ratings_test)``
    """
    logger = LogUtil.getLogger('train_test_split')
    logger.info("start splitting test and train data ...")
    timer = Timer()

    if not group:
        ratings_test = ratings.sample(frac=frac, random_state=seed)
    else:
        def _sample_rows(rows):
            # The same seed is used for every group.
            return rows.sample(frac=frac, random_state=seed)

        ratings_test = ratings.groupby(group).apply(_sample_rows)
        # apply() prepends the group key(s) to the index; remove them again.
        ratings_test.index = ratings_test.index.droplevel(group)

    # Anti-join: keep only the rows that did not end up in the test set.
    merged = pd.merge(ratings, ratings_test, indicator=True, how='outer')
    train_only = merged.query('_merge=="left_only"')
    ratings_train = train_only.drop('_merge', axis=1)

    elapsed = timer.restart()
    logger.info("splitting test and train data takes %.3f secs", elapsed)
    return ratings_train, ratings_test
from recsys.cf.funksvd import FunkSVD
from recsys.utils.data import load_movielen_data, train_test_split
from recsys.utils.debug import LogUtil

# Configure logging before any model or data helper is used.
LogUtil.configLog()

# FunkSVD model with fixed hyper-parameters.
model = FunkSVD(
    learning_rate=0.001,
    reg=0.005,
    n_epochs=100,
    n_factors=30,
)

# Only the ratings are used below; users and movies are loaded but unused.
ratings, users, movies = load_movielen_data()

# Split with the helper's default arguments, then fit and evaluate.
training, testing = train_test_split(ratings)
model.fit(training)
model.eval(testing)
def __init__(self, **kwargs):
    """
    Set up an empty selector.

    :param kwargs: forwarded unchanged to the next ``__init__`` in the MRO
    """
    # Rating matrix and user/item collections start out unset.
    self.rmat = self.users = self.items = None
    # Logger is named after the concrete (sub)class of this instance.
    logger_name = self.__class__.__name__
    self.log = LogUtil.getLogger(logger_name)
    super(BaseSelector, self).__init__(**kwargs)