class ApplyDimensionReductionModel(gokart.TaskOnKart):
    task_namespace = 'redshells.word_item_similarity'
    item2embedding_task = gokart.TaskInstanceParameter(
        description='A task outputs item2embedding data with type = Dict[Any, np.ndarray].')
    dimension_reduction_model_task = gokart.TaskInstanceParameter(
        description='A task outputs a model instance of `DimensionReductionModel`.')
    l2_normalize = luigi.BoolParameter()  # type: bool
    output_file_path = luigi.Parameter(
        default='app/word_item_similarity/dimension_reduction_model.pkl')  # type: str

    def requires(self):
        return dict(item2embedding=self.item2embedding_task, model=self.dimension_reduction_model_task)

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        item2embedding = self.load('item2embedding')  # type: Dict[Any, np.ndarray]
        model = self.load('model')
        items = list(item2embedding.keys())
        embeddings = model.apply(np.array(list(item2embedding.values())))
        if self.l2_normalize:
            embeddings = sklearn.preprocessing.normalize(embeddings, axis=1, norm='l2')
        self.dump(dict(zip(items, list(embeddings))))

class TrainLdaModel(gokart.TaskOnKart):
    output_file_path = luigi.Parameter(default='model/lda_model.pkl')  # type: str
    tokenized_text_data_task = gokart.TaskInstanceParameter(
        description='A task outputs tokenized texts with type "List[List[str]]".')
    dictionary_task = gokart.TaskInstanceParameter(
        description='A task outputs gensim.corpora.Dictionary.')
    lda_model_kwargs = luigi.DictParameter(
        default=dict(n_topics=100, chunksize=16, decay=0.5, offset=16, iterations=3, eta=1.e-16),
        description='Arguments for redshells.model.LdaModel.')  # type: Dict[str, Any]

    def requires(self):
        return dict(tokenized_texts=self.tokenized_text_data_task, dictionary=self.dictionary_task)

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        tokenized_texts = self.load('tokenized_texts')  # type: List[List[str]]
        dictionary = self.load('dictionary')  # type: gensim.corpora.Dictionary
        model = redshells.model.LdaModel(**self.lda_model_kwargs)
        model.fit(texts=tokenized_texts, dictionary=dictionary)
        self.dump(model)

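# A hedged usage sketch (not part of the original code): wiring TrainLdaModel to
# upstream tasks. `MakeTokenizedTexts` and `MakeDictionary` are hypothetical tasks
# assumed to output List[List[str]] and gensim.corpora.Dictionary respectively.
def _example_train_lda_model():
    task = TrainLdaModel(
        tokenized_text_data_task=MakeTokenizedTexts(),
        dictionary_task=MakeDictionary(),
        lda_model_kwargs=dict(n_topics=50, iterations=5))
    # run with luigi's local scheduler; gokart caches the result under output_file_path
    luigi.build([task], local_scheduler=True)
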
class CalculateSimilarityWithMatrixFactorization(gokart.TaskOnKart):
    """Calculate similarity between items using latent factors which are calculated by matrix factorization."""
    task_namespace = 'redshells.word_item_similarity'
    target_item_task = gokart.TaskInstanceParameter(description='A task outputs item ids as type List.')
    matrix_factorization_task = gokart.TaskInstanceParameter(
        description='A task instance of `TrainMatrixFactorization`.')
    normalize = luigi.BoolParameter(description='Normalize item factors with l2 norm.')  # type: bool
    batch_size = luigi.IntParameter(default=1000, significant=False)
    output_file_path = luigi.Parameter(
        default='app/word_item_similarity/calculate_similarity_with_matrix_factorization.zip')  # type: str

    def requires(self):
        assert type(self.matrix_factorization_task) == redshells.train.TrainMatrixFactorization, \
            f'`matrix_factorization_task` must be TrainMatrixFactorization, but actually {type(self.matrix_factorization_task)} is passed.'
        return dict(data=self.target_item_task, model=self.matrix_factorization_task)

    def output(self):
        return self.make_large_data_frame_target(self.output_file_path)

    def run(self):
        tf.reset_default_graph()
        data = self.load('data')  # type: List
        model = self.load('model')  # type: redshells.model.MatrixFactorization
        data = list(set(data))
        item_ids = model.get_valid_item_ids(data)
        factors = model.get_item_factors(item_ids, normalize=self.normalize)

        # Usually, the size of item_ids is too large to calculate similarities at once, so the data is split into batches.
        split_size = factors.shape[0] // self.batch_size + 1
        factors_sets = np.array_split(factors, split_size)
        item_ids_sets = np.array_split(item_ids, split_size)

        def _calculate(x, y, x_ids, y_ids):
            if np.array_equal(x_ids, y_ids):
                indices = np.triu_indices(x_ids.shape[0], k=1)
            else:
                indices_ = np.indices([x_ids.shape[0], y_ids.shape[0]])
                indices = (indices_[0].flatten(), indices_[1].flatten())
            df = pd.DataFrame({
                'item_id_0': list(x_ids[indices[0]]),
                'item_id_1': list(y_ids[indices[1]]),
                'similarity': list(np.dot(x, y.T)[indices])
            })
            return df

        results = pd.concat([
            _calculate(factors_sets[i], factors_sets[j], item_ids_sets[i], item_ids_sets[j])
            for i, j in tqdm(list(itertools.combinations_with_replacement(range(split_size), 2)))
        ])
        self.dump(results)

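# A minimal standalone sketch (my addition, not repo code) of the batched pairwise
# similarity computation above. Splitting the factor matrix keeps each dot product
# small; np.triu_indices(k=1) drops self-pairs and duplicate pairs when a block is
# compared with itself.
def _pairwise_similarity_sketch(factors: np.ndarray, item_ids: np.ndarray, split_size: int) -> pd.DataFrame:
    factor_blocks = np.array_split(factors, split_size)
    id_blocks = np.array_split(item_ids, split_size)
    frames = []
    for i, j in itertools.combinations_with_replacement(range(split_size), 2):
        x, y = factor_blocks[i], factor_blocks[j]
        x_ids, y_ids = id_blocks[i], id_blocks[j]
        if i == j:
            # upper triangle only: each unordered pair appears once, no self-similarity
            rows, cols = np.triu_indices(len(x_ids), k=1)
        else:
            # full cross product between two different blocks
            rows, cols = (idx.flatten() for idx in np.indices([len(x_ids), len(y_ids)]))
        frames.append(pd.DataFrame({
            'item_id_0': x_ids[rows],
            'item_id_1': y_ids[cols],
            'similarity': np.dot(x, y.T)[rows, cols],
        }))
    return pd.concat(frames, ignore_index=True)
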
class CalculateDocumentEmbedding(gokart.TaskOnKart):
    """Calculate document embeddings."""
    task_namespace = 'redshells.word_item_similarity'
    document_task = gokart.TaskInstanceParameter()
    scdv_task = gokart.TaskInstanceParameter()
    item_id_column_name = luigi.Parameter()  # type: str
    document_column_name = luigi.Parameter()  # type: str
    l2_normalize = luigi.BoolParameter()  # type: bool
    output_file_path = luigi.Parameter(
        default='app/word_item_similarity/calculate_document_embedding.pkl')  # type: str

    def requires(self):
        return dict(document=self.document_task, scdv=self.scdv_task)

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        scdv = self.load('scdv')
        document = self.load_data_frame(
            'document', required_columns={self.item_id_column_name, self.document_column_name})
        documents = document[self.document_column_name].tolist()
        embeddings = scdv.infer_vector(documents, l2_normalize=self.l2_normalize)
        self.dump(dict(zip(document[self.item_id_column_name].tolist(), list(embeddings))))

class FilterItemByWordSimilarity(gokart.TaskOnKart):
    word2items_task = gokart.TaskInstanceParameter()
    word2embedding_task = gokart.TaskInstanceParameter()
    item2title_embedding_task = gokart.TaskInstanceParameter()
    no_below = luigi.FloatParameter()
    output_file_path = luigi.Parameter(
        default='app/word_item_similarity/filter_item_by_word_similarity.pkl')  # type: str

    def requires(self):
        return dict(word2items=self.word2items_task,
                    word2embedding=self.word2embedding_task,
                    item2title_embedding=self.item2title_embedding_task)

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        word2items = self.load('word2items')
        word2embedding = self.load('word2embedding')
        item2title_embedding = self.load('item2title_embedding')

        filtered_word2items = defaultdict(list)
        for word, items in word2items.items():
            word_embedding = word2embedding[word]
            for item in items:
                title_embedding = item2title_embedding[item]
                if np.inner(word_embedding, title_embedding) > self.no_below:
                    filtered_word2items[word].append(item)
        self.dump(dict(filtered_word2items))

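# A tiny illustrative check (my assumption, not repo code): if the embeddings are
# L2-normalized, the inner product above is cosine similarity, so `no_below` acts
# as a cosine threshold.
def _example_filter_threshold():
    word_embedding = np.array([1.0, 0.0])
    title_embedding = np.array([0.8, 0.6])  # unit norm
    assert np.isclose(np.inner(word_embedding, title_embedding), 0.8)
    # with no_below=0.7 this item is kept; with no_below=0.9 it is dropped
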
class MergeData(gokart.TaskOnKart):
    task_namespace = 'm5-forecasting'
    calendar_data_task = gokart.TaskInstanceParameter()
    selling_price_data_task = gokart.TaskInstanceParameter()
    sales_data_task = gokart.TaskInstanceParameter()

    def requires(self):
        return dict(calendar=self.calendar_data_task,
                    selling_price=self.selling_price_data_task,
                    sales=self.sales_data_task)

    def run(self):
        calendar = self.load_data_frame('calendar')
        selling_price = self.load_data_frame('selling_price')
        sales = self.load_data_frame('sales')
        output = self._run(calendar, selling_price, sales)
        self.dump(output)

    @staticmethod
    def _run(calendar, selling_price, sales):
        sales = sales.merge(calendar, how="left", on="d")
        gc.collect()
        sales = sales.merge(selling_price, how="left", on=["store_id", "item_id", "wm_yr_wk"])
        sales.drop(["wm_yr_wk"], axis=1, inplace=True)
        gc.collect()
        del selling_price
        return sales

class CalculateWordEmbedding(gokart.TaskOnKart):
    task_namespace = 'redshells.word_item_similarity'
    word_task = gokart.TaskInstanceParameter()
    word2item_task = gokart.TaskInstanceParameter()
    item2embedding_task = gokart.TaskInstanceParameter()
    output_file_path = luigi.Parameter(
        default='app/word_item_similarity/calculate_word_embedding.pkl')  # type: str

    def requires(self):
        return dict(word=self.word_task, word2item=self.word2item_task, item2embedding=self.item2embedding_task)

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        word_data = self.load('word')
        word2item = self.load('word2item')
        item2embedding = self.load('item2embedding')
        results = {word: self._calculate(word2item[word], item2embedding) for word in word_data if word in word2item}
        self.dump(results)

    def _calculate(self, items, item2embedding):
        embeddings = [item2embedding[item] for item in items if item in item2embedding]
        if not embeddings:
            return None
        return sklearn.preprocessing.normalize([np.sum(embeddings, axis=0)], norm='l2', axis=1)[0]

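# A minimal sketch (my addition) of `_calculate` above: a word embedding is the
# L2-normalized sum of the embeddings of the word's items.
def _example_word_embedding():
    item2embedding = {'a': np.array([1.0, 0.0]), 'b': np.array([0.0, 1.0])}
    summed = np.sum([item2embedding['a'], item2embedding['b']], axis=0)  # [1, 1]
    normalized = sklearn.preprocessing.normalize([summed], norm='l2', axis=1)[0]
    assert np.allclose(normalized, [1 / np.sqrt(2), 1 / np.sqrt(2)])
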
class TaskD(gokart.TaskOnKart):
    foo = gokart.TaskInstanceParameter()
    bar = gokart.TaskInstanceParameter()

    def run(self):
        x = self.load('foo')
        y = self.load('bar')
        self.dump(x + y + ['D'])

class TrainSCDV(gokart.TaskOnKart):
    task_namespace = 'redshells'
    tokenized_text_data_task = gokart.TaskInstanceParameter(
        description='A task outputs tokenized texts with type "List[List[str]]".')
    dictionary_task = gokart.TaskInstanceParameter(
        description='A task outputs gensim.corpora.Dictionary.')
    word2vec_task = gokart.TaskInstanceParameter(
        description='A task outputs gensim.models.Word2Vec, gensim.models.FastText or models with the same interface.')
    cluster_size = luigi.IntParameter(
        default=60, description='A cluster size of Gaussian mixture model in SCDV.')  # type: int
    sparsity_percentage = luigi.FloatParameter(
        default=0.04, description='A percentage of sparsity in SCDV.')  # type: float
    gaussian_mixture_kwargs = luigi.DictParameter(
        default=dict(),
        description='Arguments for Gaussian mixture model except for cluster size.')  # type: Dict[str, Any]
    output_file_path = luigi.Parameter(default='model/scdv.pkl')  # type: str
    text_sample_size = luigi.IntParameter(
        default=10000,
        description='SCDV uses texts only to calculate the sparsity threshold, so not all text data is required.')  # type: int

    def requires(self):
        return dict(text=self.tokenized_text_data_task, dictionary=self.dictionary_task, word2vec=self.word2vec_task)

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        texts = self.load('text')  # type: List
        dictionary = self.load('dictionary')  # type: gensim.corpora.Dictionary
        word2vec = self.load('word2vec')  # type: gensim.models.Word2Vec

        if len(texts) > self.text_sample_size:
            texts = np.random.choice(texts, size=self.text_sample_size)

        if isinstance(texts[0], str):
            texts = redshells.train.utils.TokenIterator(texts=texts)

        model = redshells.model.SCDV(
            documents=texts,
            cluster_size=self.cluster_size,
            sparsity_percentage=self.sparsity_percentage,
            gaussian_mixture_kwargs=self.gaussian_mixture_kwargs,
            dictionary=dictionary,
            w2v=word2vec)
        self.dump(model)

class _DoubleLoadSubTask(gokart.TaskOnKart):
    task_namespace = __name__
    sub1 = gokart.TaskInstanceParameter()
    sub2 = gokart.TaskInstanceParameter()

    def output(self):
        return self.make_target('sub_task.txt')

    def run(self):
        self.dump(f'task uid = {self.make_unique_id()}')

class FindItemKeywordByMatching(gokart.TaskOnKart):
    """Find items whose value of `item_keyword_column_name` includes keywords.
    Output pd.DataFrame with columns [item_id, keyword].
    """
    task_namespace = 'redshells.word_item_similarity'
    target_keyword_task = gokart.TaskInstanceParameter(
        description='A task outputs keywords as type `List[Any]` or `Set[Any]`.')
    item_task = gokart.TaskInstanceParameter(
        description='A task outputs item data as type `pd.DataFrame` which has `item_id_column_name`.')
    tfidf_task = gokart.TaskInstanceParameter(description='A task instance of TrainTfidf.')
    keep_top_rate = luigi.FloatParameter(description='A rate to filter words in texts.')  # type: float
    item_id_column_name = luigi.Parameter()  # type: str
    item_keyword_column_name = luigi.Parameter()  # type: str
    output_file_path = luigi.Parameter(
        default='app/word_item_similarity/find_item_by_keyword_matching.pkl')  # type: str

    def requires(self):
        return dict(keyword=self.target_keyword_task, item=self.item_task, tfidf=self.tfidf_task)

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        keywords = set(self.load('keyword'))
        items = self.load_data_frame(
            'item', required_columns={self.item_id_column_name, self.item_keyword_column_name})
        tfidf = self.load('tfidf')  # type: redshells.model.Tfidf

        tokens = items[self.item_keyword_column_name].tolist()
        top_tokens = [
            list(zip(*values))[0] for values in tfidf.apply(tokens=tokens, keep_top_rate=self.keep_top_rate)
        ]
        item_ids = items[self.item_id_column_name].tolist()
        match_keywords = [set(t) & keywords for t in top_tokens]
        result = pd.DataFrame(
            dict(item_id=list(
                itertools.chain.from_iterable(
                    [[item_id] * len(keywords) for item_id, keywords in zip(item_ids, match_keywords)])),
                 keyword=list(itertools.chain.from_iterable(match_keywords))))
        self.dump(result)

class CalculateWordItemSimilarity(gokart.TaskOnKart):
    """Calculate similarity between words and items."""
    task_namespace = 'redshells.word_item_similarity'
    word2embedding_task = gokart.TaskInstanceParameter()
    item2embedding_task = gokart.TaskInstanceParameter()
    similarity_model_task = gokart.TaskInstanceParameter()
    prequery_return_size = luigi.IntParameter()  # type: int
    return_size = luigi.IntParameter()  # type: int
    output_file_path = luigi.Parameter(
        default='app/word_item_similarity/calculate_word_item_similarity.pkl')  # type: str

    def requires(self):
        return dict(word2embedding=self.word2embedding_task,
                    item2embedding=self.item2embedding_task,
                    model=self.similarity_model_task)

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        word2embedding = self.load('word2embedding')  # type: Dict[Any, np.ndarray]
        item2embedding = self.load('item2embedding')  # type: Dict[Any, np.ndarray]
        model = self.load('model')

        item_embeddings = np.array(list(item2embedding.values()))
        items = np.array(list(item2embedding.keys()))
        results = pd.concat([
            self._find_top_similarity(model, word, embedding, items, item_embeddings)
            for word, embedding in tqdm(word2embedding.items())
        ])
        self.dump(results.reset_index(drop=True))

    def _find_top_similarity(self, model, word, word_embedding: np.ndarray, items: np.ndarray,
                             item_embeddings: np.ndarray) -> pd.DataFrame:
        if word_embedding is None:
            logger.info(f'word {word} is not registered.')
            return pd.DataFrame(columns=['word', 'item', 'similarity'])
        filtered_indices = self._filter(word_embedding, item_embeddings)
        similarities = self._predict(model, word_embedding, item_embeddings[filtered_indices, :])
        top_indices = similarities.argsort()[-self.return_size:][::-1]
        return pd.DataFrame(
            dict(word=word, item=items[filtered_indices[top_indices]], similarity=similarities[top_indices]))

    def _predict(self, model, word_embedding: np.ndarray, item_embeddings: np.ndarray) -> np.ndarray:
        i = list(model.classes_).index(1)
        return model.predict_proba(item_embeddings * word_embedding)[:, i]

    def _filter(self, word_embedding: np.ndarray, item_embeddings: np.ndarray) -> np.ndarray:
        similarities = np.dot(item_embeddings, word_embedding.reshape([-1, 1])).flatten()
        top_indices = similarities.argsort()[-self.prequery_return_size:][::-1]
        return top_indices

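# A hedged sketch (not from the repo) of the two-stage search above: a cheap dot
# product pre-filters `prequery_return_size` candidates, then a classifier with a
# `predict_proba` interface re-ranks them and the top `return_size` survive.
def _example_two_stage_search(word_embedding, item_embeddings, model, prequery_return_size=100, return_size=10):
    # stage 1: coarse score over all items
    coarse = item_embeddings.dot(word_embedding)
    candidates = coarse.argsort()[-prequery_return_size:][::-1]
    # stage 2: re-rank candidates with the probability of the positive class
    positive_index = list(model.classes_).index(1)
    fine = model.predict_proba(item_embeddings[candidates] * word_embedding)[:, positive_index]
    return candidates[fine.argsort()[-return_size:][::-1]]
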
class TrainGraphConvolutionalMatrixCompletion(gokart.TaskOnKart):
    task_namespace = 'redshells'
    train_data_task = gokart.TaskInstanceParameter(
        description='A task outputs a pd.DataFrame with columns={`user_column_name`, `item_column_name`, `rating_column_name`}.')
    user_column_name = luigi.Parameter(default='user', description='The column name of user id.')  # type: str
    item_column_name = luigi.Parameter(default='item', description='The column name of item id.')  # type: str
    rating_column_name = luigi.Parameter(default='rating', description='The target column name to predict.')  # type: str
    user_feature_task = gokart.TaskInstanceParameter(default=NoneTask())
    item_feature_task = gokart.TaskInstanceParameter(default=NoneTask())
    model_kwargs = luigi.DictParameter(default=dict(), description='Arguments of the model.')  # type: Dict[str, Any]
    max_data_size = luigi.IntParameter(default=50000000)  # type: int
    output_file_path = luigi.Parameter(default='model/graph_convolutional_matrix_completion.zip')  # type: str
    try_count = luigi.IntParameter(default=10)  # type: int
    decay_speed = luigi.FloatParameter(default=2.0)  # type: float
    test_size = luigi.FloatParameter(default=0.2)  # type: float
    # data parameters
    min_user_click_count = luigi.IntParameter(default=5)  # type: int
    max_user_click_count = luigi.IntParameter(default=200)  # type: int

    def requires(self):
        return dict(train_data=self.train_data_task,
                    user_features=self.user_feature_task,
                    item_features=self.item_feature_task)

    def output(self):
        return dict(model=self.make_model_target(self.output_file_path,
                                                 save_function=GraphConvolutionalMatrixCompletion.save,
                                                 load_function=GraphConvolutionalMatrixCompletion.load),
                    report=self.make_target('model_report/report.txt'))

    def run(self):
        tf.reset_default_graph()
        df = self.load_data_frame(
            'train_data', required_columns={self.user_column_name, self.item_column_name, self.rating_column_name})
        user_features = self.load('user_features')
        item_features = self.load('item_features')

        df.drop_duplicates(subset=[self.user_column_name, self.item_column_name], inplace=True)
        df = sklearn.utils.shuffle(df)
        df = df.head(n=int(self.max_data_size))

        user_ids = df[self.user_column_name].values
        item_ids = df[self.item_column_name].values
        ratings = df[self.rating_column_name].values
        dataset = GcmcDataset(user_ids=user_ids,
                              item_ids=item_ids,
                              ratings=ratings,
                              user_features=user_features,
                              item_features=item_features)
        graph_dataset = GcmcGraphDataset(dataset=dataset,
                                         test_size=self.test_size,
                                         min_user_click_count=self.min_user_click_count,
                                         max_user_click_count=self.max_user_click_count)
        model = GraphConvolutionalMatrixCompletion(graph_dataset=graph_dataset, **self.model_kwargs)
        self.task_log['report'] = [str(self.model_kwargs)] + model.fit(try_count=self.try_count,
                                                                       decay_speed=self.decay_speed)
        self.dump(self.task_log['report'], 'report')
        self.dump(model, 'model')

class _PairwiseSimilarityModelTask(gokart.TaskOnKart):
    item2embedding_task = gokart.TaskInstanceParameter(
        description='A task outputs a mapping from item to embedding. The output must have type=Dict[Any, np.ndarray].')
    similarity_data_task = gokart.TaskInstanceParameter(
        description='A task outputs a pd.DataFrame with columns={`item0_column_name`, `item1_column_name`, `similarity_column_name`}. '
        '`similarity_column_name` must be binary data.')
    item0_column_name = luigi.Parameter()  # type: str
    item1_column_name = luigi.Parameter()  # type: str
    similarity_column_name = luigi.Parameter()  # type: str
    model_name = luigi.Parameter(
        default='XGBClassifier',
        description='A model name which has a "fit" interface, and must be registered by "register_prediction_model".')  # type: str
    model_kwargs = luigi.DictParameter(
        default=dict(), description='Arguments of the model which are created with model_name.')  # type: Dict[str, Any]
    output_file_path = luigi.Parameter(default='model/pairwise_similarity_model.pkl')  # type: str

    def requires(self):
        return dict(item2embedding=self.item2embedding_task, similarity_data=self.similarity_data_task)

    def output(self):
        return self.make_target(self.output_file_path)

    def create_model(self):
        return redshells.factory.create_prediction_model(self.model_name, **self.model_kwargs)

    def create_train_data(self):
        logger.info('loading input data...')
        item2embedding = self.load('item2embedding')  # type: Dict[Any, np.ndarray]
        similarity_data = self.load_data_frame(
            'similarity_data',
            required_columns={self.item0_column_name, self.item1_column_name, self.similarity_column_name})
        logger.info(f'similarity_data size={similarity_data.shape}')
        similarity_data = sklearn.utils.shuffle(similarity_data)

        logger.info('making features...')
        similarity_data[self.similarity_column_name] = similarity_data[self.similarity_column_name].astype(int)
        similarity_data = similarity_data[similarity_data[self.item0_column_name].isin(item2embedding)]
        similarity_data = similarity_data[similarity_data[self.item1_column_name].isin(item2embedding)]
        x = np.array([
            np.multiply(item2embedding[i1], item2embedding[i2]) for i1, i2 in zip(
                similarity_data[self.item0_column_name].tolist(), similarity_data[self.item1_column_name].tolist())
        ])
        y = similarity_data[self.similarity_column_name].tolist()
        logger.info('done making train data.')
        logger.info(f'size of x={len(x)}, {len(x[0])}')
        return x, y

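# A minimal sketch (my addition) of the feature construction above: a pair feature
# is the elementwise product of the two item embeddings, so a downstream classifier
# can pick up the dimensions where both embeddings agree.
def _example_pair_features():
    item2embedding = {'a': np.array([1.0, 2.0]), 'b': np.array([3.0, 0.5])}
    x = np.multiply(item2embedding['a'], item2embedding['b'])
    assert np.allclose(x, [3.0, 1.0])
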
class TrainDictionary(gokart.TaskOnKart):
    task_namespace = 'redshells'
    tokenized_text_data_task = gokart.TaskInstanceParameter(
        description='The task outputs tokenized texts with type "List[List[str]]".')
    output_file_path = luigi.Parameter(default='model/dictionary.pkl')  # type: str
    dictionary_filter_kwargs = luigi.DictParameter(
        default=dict(no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None),
        description='Arguments for gensim.corpora.Dictionary.filter_extremes. '
        'Please see gensim.corpora.Dictionary for more details.')  # type: Dict[str, Any]

    def requires(self):
        return self.tokenized_text_data_task

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        texts = self.load()  # type: List
        if isinstance(texts[0], str):
            texts = redshells.train.utils.TokenIterator(texts=texts)
        dictionary = gensim.corpora.Dictionary(texts)
        if len(self.dictionary_filter_kwargs):
            dictionary.filter_extremes(**self.dictionary_filter_kwargs)
        self.dump(dictionary)

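# A hedged example (my addition, using only the documented gensim API) of how the
# filter settings above behave on a toy corpus.
def _example_dictionary_filter():
    texts = [['apple', 'banana'], ['apple', 'cherry'], ['apple', 'banana']]
    dictionary = gensim.corpora.Dictionary(texts)
    # drop tokens in fewer than 2 documents or in more than 90% of documents:
    # 'apple' (3/3 docs) and 'cherry' (1 doc) are removed, 'banana' survives
    dictionary.filter_extremes(no_below=2, no_above=0.9, keep_n=100000)
    return dictionary
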
class TaskC(gokart.TaskOnKart):
    foo = gokart.TaskInstanceParameter()
    text = luigi.Parameter()

    def run(self):
        x = self.load('foo')
        self.dump(x + [self.text])

class _FactorizationMachineTask(gokart.TaskOnKart):
    train_data_task = gokart.TaskInstanceParameter(
        description='A task outputs a pd.DataFrame with columns={`target_column_name`}.')
    target_column_name = luigi.Parameter(default='category', description='The target column name.')  # type: str
    model_name = luigi.Parameter(
        default='XGBClassifier',
        description='A model name which has a "fit" interface, and must be registered by "register_prediction_model".')  # type: str
    model_kwargs = luigi.DictParameter(
        default=dict(), description='Arguments of the model which are created with model_name.')  # type: Dict[str, Any]

    def requires(self):
        return self.train_data_task

    def create_model(self):
        return redshells.factory.create_prediction_model(self.model_name, **self.model_kwargs)

    def create_train_data(self):
        data = self.load_data_frame(required_columns={self.target_column_name})
        data = sklearn.utils.shuffle(data)
        y = data[self.target_column_name].astype(int)
        x = data.drop(self.target_column_name, axis=1)
        return x, y

class MakePairedData(gokart.TaskOnKart):
    task_namespace = 'novelty_enhanced_bpr'
    click_task = gokart.TaskInstanceParameter()
    positive_sample_weight: int = luigi.IntParameter()
    distance_threshold: float = luigi.FloatParameter()

    def requires(self):
        return self.click_task

    def run(self):
        data = self.load()
        clicks = data['clicks_train']
        item_distance = data['item_distance']
        paired_data = self._run(clicks, item_distance, self.positive_sample_weight, self.distance_threshold)
        self.dump(paired_data)

    @staticmethod
    def _run(clicks: pd.DataFrame, item_distance: pd.DataFrame, positive_sample_weight: int,
             distance_threshold: float) -> pd.DataFrame:
        clicked_data = clicks[clicks['click'].astype(bool)].rename(columns={'item_id': 'positive_item_id'})
        not_clicked_data = clicks[~clicks['click'].astype(bool)].rename(columns={'item_id': 'negative_item_id'})
        not_clicked_data = not_clicked_data.groupby('user_id').apply(
            lambda x: x.sample(positive_sample_weight)).reset_index(drop=True)
        paired_data = pd.merge(clicked_data[['user_id', 'positive_item_id']],
                               not_clicked_data[['user_id', 'negative_item_id']],
                               on='user_id',
                               how='inner')
        paired_data = pd.merge(paired_data,
                               item_distance,
                               left_on=['positive_item_id', 'negative_item_id'],
                               right_on=['item_id_x', 'item_id_y'],
                               how='inner')
        if distance_threshold:
            paired_data = paired_data[paired_data['distance'] < distance_threshold]
        return paired_data[['user_id', 'positive_item_id', 'negative_item_id']]

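# A minimal pandas sketch (my assumption, not repo code) of the pairing logic above:
# every clicked item of a user is joined with that user's (sampled) non-clicked items,
# yielding the (positive, negative) pairs that BPR-style training consumes.
def _example_make_pairs():
    clicks = pd.DataFrame({
        'user_id': [1, 1, 1],
        'item_id': [10, 20, 30],
        'click': [True, False, False],
    })
    positives = clicks[clicks['click']].rename(columns={'item_id': 'positive_item_id'})
    negatives = clicks[~clicks['click']].rename(columns={'item_id': 'negative_item_id'})
    pairs = positives[['user_id', 'positive_item_id']].merge(
        negatives[['user_id', 'negative_item_id']], on='user_id', how='inner')
    return pairs  # two rows: (10, 20) and (10, 30) for user 1
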
class TrainFastText(gokart.TaskOnKart):
    task_namespace = 'redshells'
    tokenized_text_data_task = gokart.TaskInstanceParameter(
        description='The task outputs tokenized texts with type `List[List[str]]` or `List[str]` separated with space.')
    fasttext_kwargs = luigi.DictParameter(
        default=dict(),
        description='Arguments for FastText except "sentences". Please see gensim.models.FastText for more details.')  # type: Dict[str, Any]
    output_file_path = luigi.Parameter(default='model/fasttext.zip')  # type: str

    def requires(self):
        return self.tokenized_text_data_task

    def output(self):
        return self.make_model_target(self.output_file_path,
                                      save_function=gensim.models.FastText.save,
                                      load_function=gensim.models.FastText.load)

    def run(self):
        texts = self.load()
        assert len(texts) > 0
        shuffle(texts)
        if isinstance(texts[0], str):
            texts = TokenIterator(texts=texts)
        logger.info('training FastText...')
        model = gensim.models.FastText(sentences=texts, **self.fasttext_kwargs)
        self.dump(model)

class MakeFeature(gokart.TaskOnKart):
    task_namespace = 'm5-forecasting'
    merged_data_task = gokart.TaskInstanceParameter()
    is_train: bool = luigi.BoolParameter()
    is_small: bool = luigi.BoolParameter()

    def requires(self):
        return dict(data=self.merged_data_task)

    def run(self):
        data = self.load_data_frame('data')
        output = self._run(data, self.is_train)
        self.dump(output)

    @classmethod
    def _run(cls, data, is_train: bool):
        data = cls._label_encode(data)
        data = data.dropna(subset={'sell_price'}) if is_train else data
        return data

    @staticmethod
    def _label_encode(data):
        for v in tqdm(["item_id", "dept_id", "store_id", "cat_id", "state_id"]):
            data[v] = OrdinalEncoder(dtype="int").fit_transform(data[[v]]).astype("int16") + 1
        return data

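# A hedged example (standard sklearn API, mirroring `_label_encode` above): each id
# column is mapped to a compact int16 code, shifted by one so that 0 stays free.
def _example_label_encode():
    data = pd.DataFrame({'item_id': ['HOBBIES_1', 'FOODS_2', 'HOBBIES_1']})
    data['item_id'] = OrdinalEncoder(dtype="int").fit_transform(data[['item_id']]).astype("int16") + 1
    return data  # codes start at 1; identical strings share a code
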
class PreprocessCriteo(gokart.TaskOnKart):
    data_task = gokart.TaskInstanceParameter()

    def requires(self):
        return self.data_task

    def output(self):
        return self.make_target('criteo/train_data.pkl')

    def run(self):
        logger.info('loading...')
        df = self.load_data_frame()

        logger.info('preprocess for integer columns...')
        for c in tqdm(_get_integer_columns()):
            values = df[c].copy()
            m = np.min([x for x in values[values.notnull()]])
            values[values.notnull()] += -m + 2
            values[values.isnull()] = 1
            df[c] = np.log(values)

        logger.info('preprocess for category columns...')
        for c in _get_categorical_columns():
            df[c] = df[c].astype('category')

        logger.info('dumping...')
        self.dump(df)

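# A minimal numeric sketch (my addition) of the integer-column preprocessing above:
# values are shifted so the minimum becomes 2, NaNs become 1, and the log keeps
# everything positive while compressing heavy tails.
def _example_log_transform():
    values = pd.Series([-3.0, 0.0, 5.0, np.nan])
    m = values.min()                    # -3 (NaN is skipped)
    values[values.notnull()] += -m + 2  # -> [2, 5, 10, nan]
    values[values.isnull()] = 1         # -> [2, 5, 10, 1]
    return np.log(values)               # all finite; NaN maps to log(1) = 0
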
class GetItemDistance(gokart.TaskOnKart):
    task_namespace = 'novelty_enhanced_bpr'
    item_embed_vector_task = gokart.TaskInstanceParameter()

    def requires(self):
        return self.item_embed_vector_task

    def run(self):
        item_embed_vector = self.load()
        item_embed_vector_x = item_embed_vector.rename(
            columns={'item_id': 'item_id_x', 'item_vector': 'item_vector_x'})
        item_embed_vector_y = item_embed_vector.rename(
            columns={'item_id': 'item_id_y', 'item_vector': 'item_vector_y'})
        item_distance_df = cross_join(item_embed_vector_x, item_embed_vector_y)

        def func(vector1, vector2):
            return np.linalg.norm(vector1 - vector2)

        item_distance_df['distance'] = item_distance_df.apply(
            lambda x: func(x['item_vector_x'], x['item_vector_y']), axis=1)
        self.dump(item_distance_df[['item_id_x', 'item_id_y', 'distance']])

class TrainDoc2Vec(gokart.TaskOnKart):
    task_namespace = 'redshells'
    tokenized_text_data_task = gokart.TaskInstanceParameter(
        description='The task outputs tokenized texts with type "List[List[str]]".')
    output_file_path = luigi.Parameter(default='model/doc2vec.zip')  # type: str
    doc2vec_kwargs = luigi.DictParameter(
        default=dict(),
        description='Arguments for Doc2Vec except "documents". Please see gensim.models.Doc2Vec for more details.')  # type: Dict[str, Any]

    def requires(self):
        return self.tokenized_text_data_task

    def output(self):
        return self.make_model_target(self.output_file_path,
                                      save_function=gensim.models.Doc2Vec.save,
                                      load_function=gensim.models.Doc2Vec.load)

    def run(self):
        texts = self.load()  # type: List[List[str]]
        shuffle(texts)
        documents = [gensim.models.doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(texts)]
        model = gensim.models.Doc2Vec(documents=documents, **self.doc2vec_kwargs)
        model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
        self.dump(model)

class ExtractColumnAsDict(gokart.TaskOnKart):
    """Extract column data of pd.DataFrame as dict, and keep the first value when values of `key_column_name` are duplicated."""
    task_namespace = 'redshells.data_frame_utils'
    data_task = gokart.TaskInstanceParameter(description='A task outputs pd.DataFrame.')
    key_column_name = luigi.Parameter()  # type: str
    value_column_name = luigi.Parameter()  # type: str
    output_file_path = luigi.Parameter(default='data/extract_column_as_dict.pkl')  # type: str

    def requires(self):
        return self.data_task

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        data = self.load_data_frame(required_columns={self.key_column_name, self.value_column_name})
        data.drop_duplicates(self.key_column_name, keep='first', inplace=True)
        self.dump(dict(zip(data[self.key_column_name].tolist(), data[self.value_column_name].tolist())))

class ConvertToOneHot(gokart.TaskOnKart):
    """Convert column values of `categorical_column_names` to one-hot."""
    task_namespace = 'redshells.data_frame_utils'
    data_task = gokart.TaskInstanceParameter(description='A task outputs pd.DataFrame.')
    categorical_column_names = luigi.ListParameter()  # type: List[str]
    output_file_path = luigi.Parameter(default='data/convert_to_one_hot.pkl')  # type: str

    def requires(self):
        return self.data_task

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        categorical_column_names = list(self.categorical_column_names)
        data = self.load_data_frame(required_columns=set(categorical_column_names))
        result = pd.get_dummies(data[categorical_column_names])
        result = result.merge(data.drop(categorical_column_names, axis=1), left_index=True, right_index=True)
        self.dump(result)

class GroupByColumnAsDict(gokart.TaskOnKart):
    """Group by column names of pd.DataFrame and return a map from `key_column_name` to a list of `value_column_name`.
    **This always drops na values.**
    """
    task_namespace = 'redshells.data_frame_utils'
    data_task = gokart.TaskInstanceParameter(description='A task outputs pd.DataFrame.')
    key_column_name = luigi.Parameter()  # type: str
    value_column_name = luigi.Parameter()  # type: str
    output_file_path = luigi.Parameter(default='data/group_by_column_as_dict.pkl')  # type: str

    def requires(self):
        return self.data_task

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        data = self.load_data_frame(required_columns={self.key_column_name, self.value_column_name})
        data.dropna(subset=[self.key_column_name, self.value_column_name], inplace=True)
        result = data.groupby(by=self.key_column_name)[self.value_column_name].apply(list).to_dict()
        self.dump(result)

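# A quick pandas illustration (my addition) of the groupby-to-dict conversion above.
def _example_group_by_column_as_dict():
    data = pd.DataFrame({'key': ['a', 'a', 'b'], 'value': [1, 2, 3]})
    result = data.groupby(by='key')['value'].apply(list).to_dict()
    assert result == {'a': [1, 2], 'b': [3]}
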
class _DummyTask(gokart.TaskOnKart):
    task_namespace = __name__
    sub_task = gokart.TaskInstanceParameter()

    def output(self):
        return self.make_target('test.txt')

    def run(self):
        self.dump('test')

class TaskB(TaskBase):
    task = gokart.TaskInstanceParameter()

    def requires(self):
        return self.task

    def run(self):
        params = self.load()
        params.update({'trained': True})  # training model
        self.dump(params)

class LoadDataOfTask(gokart.TaskOnKart):
    task_namespace = 'redshells'
    data_task = gokart.TaskInstanceParameter()
    target_name = luigi.Parameter()

    def requires(self):
        return self.data_task

    def output(self):
        return self.input()[self.target_name]

class TrainMatrixFactorization(gokart.TaskOnKart):
    task_namespace = 'redshells'
    train_data_task = gokart.TaskInstanceParameter(
        description='A task outputs a pd.DataFrame with columns={`user_column_name`, `item_column_name`, `service_column_name`, `rating_column_name`}.')
    user_column_name = luigi.Parameter(default='user', description='The column name of user id.')  # type: str
    item_column_name = luigi.Parameter(default='item', description='The column name of item id.')  # type: str
    service_column_name = luigi.Parameter(default='service', description='The column name of service id.')  # type: str
    rating_column_name = luigi.Parameter(default='rating', description='The target column name to predict.')  # type: str
    model_kwargs = luigi.DictParameter(default=dict(), description='Arguments of the model.')  # type: Dict[str, Any]
    max_data_size = luigi.IntParameter(default=50000000)
    output_file_path = luigi.Parameter(default='model/matrix_factorization.zip')  # type: str

    def requires(self):
        return self.train_data_task

    def output(self):
        return self.make_model_target(self.output_file_path,
                                      save_function=MatrixFactorization.save,
                                      load_function=MatrixFactorization.load)

    def run(self):
        tf.reset_default_graph()
        df = self.load_data_frame(
            required_columns={self.user_column_name, self.item_column_name, self.service_column_name, self.rating_column_name})
        df.drop_duplicates(subset=[self.user_column_name, self.item_column_name], inplace=True)
        df = sklearn.utils.shuffle(df)
        df = df.head(n=self.max_data_size)

        user_ids = df[self.user_column_name]
        item_ids = df[self.item_column_name]
        service_ids = df[self.service_column_name]
        ratings = df[self.rating_column_name]

        model = MatrixFactorization(**self.model_kwargs)
        model.fit(user_ids=user_ids, item_ids=item_ids, service_ids=service_ids, ratings=ratings)
        self.dump(model)

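# A hedged end-to-end wiring sketch (my addition): training matrix factorization and
# feeding the trained task instance into the similarity task defined earlier.
# `MakeRatingLog` and `MakeTargetItems` are hypothetical upstream tasks assumed to
# output the rating DataFrame and a List of item ids respectively.
def _example_train_and_use_matrix_factorization():
    train_task = TrainMatrixFactorization(train_data_task=MakeRatingLog())
    # CalculateSimilarityWithMatrixFactorization asserts that this parameter is a
    # TrainMatrixFactorization instance, so the trained task is passed directly.
    similarity_task = CalculateSimilarityWithMatrixFactorization(
        target_item_task=MakeTargetItems(),
        matrix_factorization_task=train_task,
        normalize=True)
    luigi.build([similarity_task], local_scheduler=True)
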