Example #1
    def create_feature(self):
        # Check if the dataset id is train or test
        if is_test_or_val_set(self.dataset_id):
            train_dataset_id = get_train_set_id_from_test_or_val_set(
                self.dataset_id)
            test_dataset_id = self.dataset_id
        else:
            train_dataset_id = self.dataset_id
            test_dataset_id = get_test_or_val_set_id_from_train(
                train_dataset_id)

        import Utils.Data.Data as data
        train_df = data.get_dataset([
            "mapped_feature_creator_id", "mapped_feature_engager_id",
            f"tweet_feature_engagement_is_{self._get_suffix()}"
        ], train_dataset_id)
        if is_test_or_val_set(self.dataset_id):
            test_df = data.get_dataset(
                ["mapped_feature_creator_id", "mapped_feature_engager_id"],
                test_dataset_id)
            train_df = train_df[
                train_df[f"tweet_feature_engagement_is_{self._get_suffix()}"]]
            res = compute(train_df, test_df)
            res.sort_index(inplace=True)
            self._save_test_result(res, test_dataset_id)
        else:
            # Compute the folds
            X_train_folds = np.array_split(train_df.sample(frac=1),
                                           self.number_of_folds)

            result = None

            for i in range(self.number_of_folds):
                local_train = pd.concat([
                    X_train_folds[x] for x in range(self.number_of_folds)
                    if x != i
                ])
                local_train = local_train[local_train[
                    f"tweet_feature_engagement_is_{self._get_suffix()}"]]
                local_test = X_train_folds[i]

                res = compute(local_train, local_test)

                if result is None:
                    result = res
                else:
                    result = pd.concat([result, res])

            self._save_train_result_if_not_present(result, train_dataset_id)
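The helper compute(train_df, test_df) is not shown in this snippet. Judging from how it is called, it derives a per-row score for the test split from the positive engagements kept in the train split; a minimal sketch under that assumption (the grouping key is a guess, not taken from the repository):

import pandas as pd

def compute(train_df, test_df):
    # Hypothetical: count positive engagements per engager in the train
    # split and map them onto the test split (0 for unseen engagers).
    counts = train_df.groupby("mapped_feature_engager_id").size()
    mapped = test_df["mapped_feature_engager_id"].map(counts)
    return pd.DataFrame(mapped.fillna(0).astype(int), index=test_df.index)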
Example #2
    def create_feature(self):
        if is_test_or_val_set(self.dataset_id):

            train_dataset_id = get_train_set_id_from_test_or_val_set(self.dataset_id)

            engager_id_feature = MappedFeatureEngagerId(train_dataset_id)
            engagement_feature = TweetFeatureEngagementIsReply(train_dataset_id)

            engager_id_df = engager_id_feature.load_or_create()
            engagement_df = engagement_feature.load_or_create()

            # Concatenate the engager id and engagement columns
            dataframe = pd.concat([engager_id_df, engagement_df], axis=1)
            dataframe = dataframe[dataframe[engagement_feature.feature_name]]
            dataframe = pd.DataFrame({self.feature_name: dataframe.groupby(engager_id_feature.feature_name).size()})
            dictionary = dataframe.to_dict()[self.feature_name]

            test_engager_id_feature = MappedFeatureEngagerId(self.dataset_id)
            test_engager_id_df = test_engager_id_feature.load_or_create()

            engagement_count_df = pd.DataFrame(
                test_engager_id_df[test_engager_id_feature.feature_name].map(
                    lambda x: dictionary.get(x, 0)))
            self.save_feature(engagement_count_df)
        else:

            engager_id_feature = MappedFeatureEngagerId(self.dataset_id)
            engagement_feature = TweetFeatureEngagementIsReply(self.dataset_id)

            engager_id_df = engager_id_feature.load_or_create()
            engagement_df = engagement_feature.load_or_create()

            # Concatenate the engager id and engagement columns
            dataframe = pd.concat([engager_id_df, engagement_df], axis=1)
            dataframe = dataframe[dataframe[engagement_feature.feature_name]]
            dataframe = pd.DataFrame({self.feature_name: dataframe.groupby(engager_id_feature.feature_name).size()})
            dictionary = dataframe.to_dict()[self.feature_name]

            engagement_count_df = pd.DataFrame(
                engager_id_df[engager_id_feature.feature_name].map(lambda x: dictionary.get(x, 0)))
            self.save_feature(engagement_count_df)
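The two branches above build the engager-to-count dictionary with identical code; only the dataset used for the final lookup differs. A sketch of how the shared part could be factored out (the helper name is ours, not from the repository):

def _build_engagement_count_dict(self, dataset_id):
    # Map each engager id to its number of positive reply engagements.
    engager_id_feature = MappedFeatureEngagerId(dataset_id)
    engagement_feature = TweetFeatureEngagementIsReply(dataset_id)
    dataframe = pd.concat([engager_id_feature.load_or_create(),
                           engagement_feature.load_or_create()], axis=1)
    # Keep only the rows where the engagement actually happened.
    dataframe = dataframe[dataframe[engagement_feature.feature_name]]
    return dataframe.groupby(engager_id_feature.feature_name).size().to_dict()

create_feature would then build the dictionary on the train set and map it over the engager column of whichever dataset is being created.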
Example #3
    def create_feature(self):
        kind = "comment"
        # Load the hashtags column
        feature = MappedFeatureTweetHashtags(self.dataset_id)
        feature_df = feature.load_or_create()
        # Load the list of discriminative for the like class
        if not is_test_or_val_set(self.dataset_id):
            kind_pos, kind_neg = self.loadDiscriminative(
                kind, self.dataset_id, feature_df, feature.feature_name, 3, 3)
        else:
            kind_pos, kind_neg = loadPosAndNegLists(kind)
        # Create the feature
        kind_disc_df = pd.DataFrame()
        hashtags = feature_df[feature.feature_name]
        kind_disc_df[self.feature_name + "pos"] = hashtags.progress_map(
            lambda x: containsHashtag(x, kind_pos) if x is not None else False)
        kind_disc_df[self.feature_name + "neg"] = hashtags.progress_map(
            lambda x: containsHashtag(x, kind_neg) if x is not None else False)
        kind_disc_df = kind_disc_df.astype(int)

        self.save_feature(kind_disc_df)
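containsHashtag is not defined in the snippet; given the call sites, it presumably reports whether any of a tweet's hashtags appears in the discriminative list. A plausible sketch:

def containsHashtag(hashtags, discriminative_list):
    # True if at least one hashtag of the tweet is in the list of
    # discriminative hashtags for this engagement kind.
    discriminative = set(discriminative_list)
    return any(h in discriminative for h in hashtags)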
Example #4
    def create_feature(self):
        # Check if the dataset id is train or test
        if is_test_or_val_set(self.dataset_id):
            train_dataset_id = get_train_set_id_from_test_or_val_set(
                self.dataset_id)
            test_dataset_id = self.dataset_id
        else:
            train_dataset_id = self.dataset_id
            test_dataset_id = get_test_or_val_set_id_from_train(
                train_dataset_id)

        # Load features
        creation_timestamps_feature = RawFeatureTweetTimestamp(
            train_dataset_id)
        engagers_feature = MappedFeatureEngagerId(train_dataset_id)

        # Save the column name
        eng_col = engagers_feature.feature_name

        dataframe = pd.concat([
            creation_timestamps_feature.load_or_create(),
            engagers_feature.load_or_create(),
        ],
                              axis=1)

        dataframe.sort_values(creation_timestamps_feature.feature_name,
                              inplace=True)

        engager_counter_array = np.zeros(
            dataframe[engagers_feature.feature_name].max() + 1, dtype=int)

        result = pd.DataFrame([
            find_and_increase(engager_id, engager_counter_array)
            for engager_id in dataframe[eng_col]
        ],
                              index=dataframe.index)

        if not EngagerFeatureNumberOfPreviousEngagement(
                train_dataset_id).has_feature():
            result.sort_index(inplace=True)
            EngagerFeatureNumberOfPreviousEngagement(
                train_dataset_id).save_feature(result)
        if not EngagerFeatureNumberOfPreviousEngagement(
                test_dataset_id).has_feature():
            # Load features
            creation_timestamps_feature = RawFeatureTweetTimestamp(
                test_dataset_id)
            engagers_feature = MappedFeatureEngagerId(test_dataset_id)

            dataframe = pd.concat([
                creation_timestamps_feature.load_or_create(),
                engagers_feature.load_or_create(),
            ],
                                  axis=1)

            max_engager_id = dataframe[engagers_feature.feature_name].max()
            if max_engager_id + 1 > engager_counter_array.size:
                engager_counter_array = np.pad(
                    engager_counter_array,
                    pad_width=(0, max_engager_id + 1 -
                               engager_counter_array.size),
                    mode='constant',
                    constant_values=0)

            result = pd.DataFrame(
                dataframe[eng_col].map(lambda x: engager_counter_array[x]),
                index=dataframe.index)
            EngagerFeatureNumberOfPreviousEngagement(
                test_dataset_id).save_feature(result)
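find_and_increase is used here (and in later examples) but never shown. Since the rows are processed in timestamp order and the feature counts previous engagements, it presumably returns the counter value before incrementing it; a minimal sketch under that assumption:

def find_and_increase(engager_id, counter_array):
    # Number of engagements seen for this engager so far; then record
    # the current one (rows arrive sorted by tweet timestamp).
    current = counter_array[engager_id]
    counter_array[engager_id] += 1
    return current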
Example #5
    def create_feature(self):

        # Check if the dataset id is train or test
        if not is_test_or_val_set(self.dataset_id):
            # The current dataset is the train set
            train_dataset_id = self.dataset_id

            # Load the dataset and shuffle it
            import Utils.Data.Data as data
            X_train = data.get_dataset(features=self.features,
                                       dataset_id=train_dataset_id,
                                       nthread=64)

            print(X_train)
            print(X_train.memory_usage())

            Y_train = data.get_dataset(features=self.label,
                                       dataset_id=train_dataset_id,
                                       nthread=64)

            print(Y_train)
            print(Y_train.memory_usage())

            # Scores of each fold, used for aggregating the results
            scores = []
            kf = KFold(n_splits=4, shuffle=True, random_state=8)
            # Train one model per fold, scoring the held-out fold
            for train_index, test_index in kf.split(X_train):
                # Subsample 5% of the fold's training indices, with replacement
                train_index = np.random.choice(train_index,
                                               int(len(train_index) / 20),
                                               replace=True)
                local_X_train = X_train.iloc[train_index]
                local_Y_train = Y_train.iloc[train_index]

                # Compute the test set
                local_X_test = X_train.iloc[test_index]

                # Generate the dataset id for this fold
                fold_dataset_id = f"{self.feature_name}_{self.dataset_id}_fold_{len(scores)}"

                # Create the sub-feature
                feature = XGBEnsembling(fold_dataset_id, local_X_train,
                                        local_Y_train, local_X_test,
                                        self.param_dict)

                # Retrieve the scores
                scores.append(
                    pd.DataFrame(feature.load_or_create(),
                                 index=local_X_test.index))
                print(scores)

            # Compute the resulting dataframe and sort the results
            result = pd.concat(scores).sort_index()

            # Save it as a feature
            self.save_feature(result)

        else:
            test_dataset_id = self.dataset_id
            train_dataset_id = get_train_set_id_from_test_or_val_set(
                test_dataset_id)
            # Load the train dataset
            import Utils.Data.Data as data
            X_train = data.get_dataset_batch(features=self.features,
                                             dataset_id=train_dataset_id,
                                             total_n_split=1,
                                             split_n=0,
                                             sample=0.05)
            Y_train = data.get_dataset_batch(features=self.label,
                                             dataset_id=train_dataset_id,
                                             total_n_split=1,
                                             split_n=0,
                                             sample=0.05)

            # Load the test dataset
            X_test = data.get_dataset(features=self.features,
                                      dataset_id=test_dataset_id,
                                      nthread=64)

            fold_dataset_id = f"{self.feature_name}_{self.dataset_id}"

            # Create the sub-feature
            feature = XGBEnsembling(fold_dataset_id, X_train, Y_train, X_test,
                                    self.param_dict)

            # Retrieve the scores
            result = pd.DataFrame(feature.load_or_create(), index=X_test.index)

            # Save it as a feature
            self.save_feature(result)
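Stripped of the feature-store plumbing, the train branch is a standard out-of-fold scheme: every row is scored by a model that never saw it. A self-contained sketch of the pattern (the model and names are illustrative, not from the repository):

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

def out_of_fold_scores(X, y, n_splits=4):
    scores = pd.Series(index=X.index, dtype=float)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=8)
    for train_index, test_index in kf.split(X):
        model = LogisticRegression(max_iter=1000)
        model.fit(X.iloc[train_index], y.iloc[train_index])
        # Score only the held-out fold, as the snippet above does.
        scores.iloc[test_index] = model.predict_proba(X.iloc[test_index])[:, 1]
    return scores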
Example #6
    def create_feature(self):
        # Check if the dataset id is train or test
        if is_test_or_val_set(self.dataset_id):
            train_dataset_id = get_train_set_id_from_test_or_val_set(
                self.dataset_id)
            test_dataset_id = self.dataset_id
        else:
            train_dataset_id = self.dataset_id
            test_dataset_id = get_test_or_val_set_id_from_train(
                train_dataset_id)

        # Load features
        creation_timestamps_feature = RawFeatureTweetTimestamp(
            train_dataset_id)
        creators_feature = MappedFeatureCreatorId(train_dataset_id)
        tweet_id_feature = MappedFeatureTweetId(train_dataset_id)

        # Save the column name
        creators_col = creators_feature.feature_name
        tweet_id_col = tweet_id_feature.feature_name

        length_dict = TweetTokenLengthFeatureDictArray().load_or_create()
        length_unique_dict = (
            TweetTokenLengthUniqueFeatureDictArray().load_or_create())

        dataframe = pd.concat([
            creators_feature.load_or_create(),
            creation_timestamps_feature.load_or_create(),
            tweet_id_feature.load_or_create()
        ],
                              axis=1)

        dataframe.sort_values(creation_timestamps_feature.feature_name,
                              inplace=True)

        creator_length_array = np.zeros(
            dataframe[creators_feature.feature_name].max() + 1, dtype=int)
        creator_length_unique_array = np.zeros(
            dataframe[creators_feature.feature_name].max() + 1, dtype=int)

        result = pd.DataFrame([
            find_ratio_and_update(
                creator_id, creator_length_array, creator_length_unique_array,
                length_dict[tweet_id], length_unique_dict[tweet_id])
            for creator_id, tweet_id in zip(dataframe[creators_col],
                                            dataframe[tweet_id_col])
        ],
                              index=dataframe.index)

        if not CreatorFrequencyUniqueTokens(train_dataset_id).has_feature():
            result.sort_index(inplace=True)
            CreatorFrequencyUniqueTokens(train_dataset_id).save_feature(result)

        if not CreatorFrequencyUniqueTokens(test_dataset_id).has_feature():

            # Load features
            creation_timestamps_feature = RawFeatureTweetTimestamp(
                test_dataset_id)
            creators_feature = MappedFeatureCreatorId(test_dataset_id)
            tweet_id_feature = MappedFeatureTweetId(test_dataset_id)

            # Save the column name
            creators_col = creators_feature.feature_name
            tweet_id_col = tweet_id_feature.feature_name

            dataframe = pd.concat([
                creation_timestamps_feature.load_or_create(),
                creators_feature.load_or_create(),
                tweet_id_feature.load_or_create(),
            ],
                                  axis=1)
            dataframe.sort_values(creation_timestamps_feature.feature_name,
                                  inplace=True)

            # if there are new creators in the test set, pad the arrays
            if dataframe[creators_col].max() + 1 > creator_length_array.size:
                creator_length_array = np.pad(
                    creator_length_array,
                    pad_width=(0, dataframe[creators_col].max() + 1 -
                               creator_length_array.size),
                    mode='constant',
                    constant_values=0)

                creator_length_unique_array = np.pad(
                    creator_length_unique_array,
                    pad_width=(0, dataframe[creators_col].max() + 1 -
                               creator_length_unique_array.size),
                    mode='constant',
                    constant_values=0)

            result = pd.DataFrame([
                find_ratio_and_update(
                    creator_id, creator_length_array,
                    creator_length_unique_array, length_dict[tweet_id],
                    length_unique_dict[tweet_id]) for creator_id, tweet_id in
                zip(dataframe[creators_col], dataframe[tweet_id_col])
            ],
                                  index=dataframe.index)

            result.sort_index(inplace=True)

            CreatorFrequencyUniqueTokens(test_dataset_id).save_feature(result)
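find_ratio_and_update is not shown. Given the class name CreatorFrequencyUniqueTokens and the two accumulator arrays, it plausibly maintains running token totals per creator and returns the unique/total ratio; a sketch under those assumptions:

def find_ratio_and_update(creator_id, length_array, unique_length_array,
                          tweet_length, tweet_unique_length):
    # Accumulate the creator's total and unique token counts, then
    # return the running unique/total ratio (0.0 guards a creator
    # whose accumulated length is still zero).
    length_array[creator_id] += tweet_length
    unique_length_array[creator_id] += tweet_unique_length
    total = length_array[creator_id]
    return unique_length_array[creator_id] / total if total > 0 else 0.0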
Example #7
    def create_feature(self):
        # Check if the dataset id is train or test
        if is_test_or_val_set(self.dataset_id):
            train_dataset_id = get_train_set_id_from_test_or_val_set(
                self.dataset_id)
            test_dataset_id = self.dataset_id
        else:
            train_dataset_id = self.dataset_id
            test_dataset_id = get_test_or_val_set_id_from_train(
                train_dataset_id)

        start_time = time.time()

        # Load features
        creation_timestamps_feature = RawFeatureTweetTimestamp(
            train_dataset_id)
        creator_id_feature = MappedFeatureCreatorId(train_dataset_id)
        engagement_feature = self._get_engagement_feature(train_dataset_id)

        # save column names
        creator_id_col = creator_id_feature.feature_name
        engagement_col = engagement_feature.feature_name

        dataframe = pd.concat([
            creation_timestamps_feature.load_or_create(),
            creator_id_feature.load_or_create(),
            engagement_feature.load_or_create()
        ],
                              axis=1)

        dataframe.sort_values(creation_timestamps_feature.feature_name,
                              inplace=True)

        creator_counter_array = np.zeros(dataframe[creator_id_col].max() + 1,
                                         dtype=int)

        result = pd.DataFrame([
            find_and_increase(creator_id=creator_id,
                              counter_array=creator_counter_array)
            if engagement else creator_counter_array[creator_id]
            for creator_id, engagement in zip(dataframe[creator_id_col],
                                              dataframe[engagement_col])
        ],
                              index=dataframe.index)
        self._save_train_result_if_not_present(result, train_dataset_id)

        if not self._exists_test_feature(test_dataset_id):
            # Load features
            creation_timestamps_feature = RawFeatureTweetTimestamp(
                test_dataset_id)
            creator_id_feature = MappedFeatureCreatorId(test_dataset_id)

            # save column names
            creator_id_col = creator_id_feature.feature_name

            dataframe = pd.concat([
                creator_id_feature.load_or_create(),
                creation_timestamps_feature.load_or_create(),
            ],
                                  axis=1)

            dataframe.sort_values(creation_timestamps_feature.feature_name,
                                  inplace=True)

            max_creator_id = dataframe[creator_id_col].max()
            if max_creator_id + 1 > creator_counter_array.size:
                creator_counter_array = np.pad(
                    creator_counter_array,
                    pad_width=(0, max_creator_id + 1 -
                               creator_counter_array.size),
                    mode='constant',
                    constant_values=0)

            result = pd.DataFrame(dataframe[creator_id_col].map(
                lambda x: creator_counter_array[x]),
                                  index=dataframe.index)

            result.sort_index(inplace=True)

            print("time:")
            print(time.time() - start_time)

            self._save_test_result(result, test_dataset_id)
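The padding step exists because the test set can contain creator ids the train set never produced; growing the array with zeros makes those ids read as 0 previous engagements. The same idiom recurs in several examples and could be shared (the helper name is ours):

import numpy as np

def pad_to_fit(counter_array, max_id):
    # Grow the counter with zeros so counter_array[max_id] is valid.
    if max_id + 1 > counter_array.size:
        counter_array = np.pad(counter_array,
                               pad_width=(0, max_id + 1 - counter_array.size),
                               mode='constant',
                               constant_values=0)
    return counter_array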
Example #8
    def create_feature(self):
        # Check if the dataset id is train or test
        if is_test_or_val_set(self.dataset_id):
            train_dataset_id = get_train_set_id_from_test_or_val_set(
                self.dataset_id)
            test_dataset_id = self.dataset_id
        else:
            train_dataset_id = self.dataset_id
            test_dataset_id = get_test_or_val_set_id_from_train(
                train_dataset_id)

        # Load features
        creation_timestamps_feature = RawFeatureTweetTimestamp(
            train_dataset_id)
        engagers_feature = MappedFeatureEngagerId(train_dataset_id)
        creators_feature = MappedFeatureCreatorId(train_dataset_id)
        language_feature = MappedFeatureTweetLanguage(train_dataset_id)
        engagement_feature = TweetFeatureEngagementIsNegative(train_dataset_id)

        dataframe = pd.concat([
            creation_timestamps_feature.load_or_create(),
            engagers_feature.load_or_create(),
            engagement_feature.load_or_create(),
            creators_feature.load_or_create(),
            language_feature.load_or_create()
        ],
                              axis=1)

        dataframe.sort_values(creation_timestamps_feature.feature_name,
                              inplace=True)

        # KEY: a tuple (engager, language)
        # VALUE: the number of negative engagements the engager has had
        # with tweets in that language. Missing key -> 0 times.
        engagement_dict = {}

        result = pd.DataFrame([
            find_and_increase_engager(eng_id, cre_id, lang, engagement_dict)
            if engagement else engagement_dict.get((eng_id, lang), 0)
            for eng_id, cre_id, lang, engagement in zip(
                dataframe[engagers_feature.feature_name], dataframe[
                    creators_feature.feature_name], dataframe[
                        language_feature.feature_name], dataframe[
                            engagement_feature.feature_name])
        ],
                              index=dataframe.index)

        if not EngagerFeatureNumberOfPreviousNegativeEngagementWithLanguage(
                train_dataset_id).has_feature():
            result.sort_index(inplace=True)
            EngagerFeatureNumberOfPreviousNegativeEngagementWithLanguage(
                train_dataset_id).save_feature(result)
        if not EngagerFeatureNumberOfPreviousNegativeEngagementWithLanguage(
                test_dataset_id).has_feature():
            # Load features
            creation_timestamps_feature = RawFeatureTweetTimestamp(
                test_dataset_id)
            engagers_feature = MappedFeatureEngagerId(test_dataset_id)
            language_feature = MappedFeatureTweetLanguage(test_dataset_id)
            creators_feature = MappedFeatureCreatorId(test_dataset_id)

            dataframe = pd.concat([
                creation_timestamps_feature.load_or_create(),
                engagers_feature.load_or_create(),
                creators_feature.load_or_create(),
                language_feature.load_or_create()
            ],
                                  axis=1)

            dataframe.sort_values(creation_timestamps_feature.feature_name,
                                  inplace=True)

            result = pd.DataFrame([
                find_and_increase_creator(eng_id, cre_id, lang,
                                          engagement_dict)
                for eng_id, cre_id, lang in zip(
                    dataframe[engagers_feature.feature_name], dataframe[
                        creators_feature.feature_name], dataframe[
                            language_feature.feature_name])
            ],
                                  index=dataframe.index)
            result.sort_index(inplace=True)

            EngagerFeatureNumberOfPreviousNegativeEngagementWithLanguage(
                test_dataset_id).save_feature(result)
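find_and_increase_engager is not shown here; the fallback engagement_dict.get((eng_id, lang), 0) implies the dictionary is keyed by (engager, language). A plausible sketch that returns the count before recording the current negative engagement (creator_id is accepted to match the call site but unused in this guess):

def find_and_increase_engager(engager_id, creator_id, language,
                              engagement_dict):
    # How many negative engagements this engager already had with
    # tweets in this language; then record the current one.
    key = (engager_id, language)
    current = engagement_dict.get(key, 0)
    engagement_dict[key] = current + 1
    return current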
Example #9
    def create_feature(self):
        if is_test_or_val_set(self.dataset_id):

            train_dataset_id = get_train_set_id_from_test_or_val_set(
                self.dataset_id)

            # Load the necessary features
            creator_id_feature = MappedFeatureCreatorId(train_dataset_id)
            engager_id_feature = MappedFeatureEngagerId(train_dataset_id)
            language_id_feature = MappedFeatureTweetLanguage(train_dataset_id)
            engagement_feature = TweetFeatureEngagementIsLike(train_dataset_id)

            # Load the dataframes
            creator_id_df = creator_id_feature.load_or_create()
            engager_id_df = engager_id_feature.load_or_create()
            language_id_df = language_id_feature.load_or_create()
            engagement_df = engagement_feature.load_or_create()

            # Concatenate the dataframes
            dataframe = pd.concat(
                [creator_id_df, engager_id_df, language_id_df, engagement_df],
                axis=1)

            # Keep only the positive interactions
            positive_dataframe = dataframe[
                dataframe[engagement_feature.feature_name]]

            # Languages known for each user when acting as creator
            dictionary_creator_df = pd.DataFrame(positive_dataframe[[
                creator_id_feature.feature_name,
                language_id_feature.feature_name,
                engagement_feature.feature_name
            ]].groupby([
                creator_id_feature.feature_name,
                language_id_feature.feature_name
            ]).first())

            dictionary_creator_df.columns = ['users']

            dictionary_creator = dictionary_creator_df.to_dict()['users']

            # Languages known for each user when acting as engager
            dictionary_engager_df = pd.DataFrame(positive_dataframe[[
                engager_id_feature.feature_name,
                language_id_feature.feature_name,
                engagement_feature.feature_name
            ]].groupby([
                engager_id_feature.feature_name,
                language_id_feature.feature_name
            ]).first())

            dictionary_engager_df.columns = ['users']

            dictionary_engager = dictionary_engager_df.to_dict()['users']

            # Merge the two dictionaries
            dictionary_user = {**dictionary_creator, **dictionary_engager}

            # Load the test information
            test_engager_id_feature = MappedFeatureEngagerId(self.dataset_id)
            test_tweet_language_feature = MappedFeatureTweetLanguage(
                self.dataset_id)
            test_engager_id_df = test_engager_id_feature.load_or_create()
            test_tweet_language_df = test_tweet_language_feature.load_or_create()

            test_dataframe = pd.concat(
                [test_engager_id_df, test_tweet_language_df], axis=1)

            # Look up each (engager, language) pair in the merged dictionary
            result_df = pd.DataFrame(test_dataframe[[
                engager_id_feature.feature_name,
                language_id_feature.feature_name
            ]].apply(lambda x: dictionary_user.get((x[0], x[1]), False),
                     axis=1))

            # Save back the dataframe
            self.save_feature(result_df)
        else:

            # Load the necessary features
            creator_id_feature = MappedFeatureCreatorId(self.dataset_id)
            engager_id_feature = MappedFeatureEngagerId(self.dataset_id)
            language_id_feature = MappedFeatureTweetLanguage(self.dataset_id)
            engagement_feature = TweetFeatureEngagementIsLike(self.dataset_id)

            # Load the dataframes
            creator_id_df = creator_id_feature.load_or_create()
            engager_id_df = engager_id_feature.load_or_create()
            language_id_df = language_id_feature.load_or_create()
            engagement_df = engagement_feature.load_or_create()

            # Concatenate the dataframes
            dataframe = pd.concat(
                [creator_id_df, engager_id_df, language_id_df, engagement_df],
                axis=1)

            # Keep only the positive interactions
            positive_dataframe = dataframe[
                dataframe[engagement_feature.feature_name]]

            # Languages known for each user when acting as creator
            dictionary_creator_df = pd.DataFrame(positive_dataframe[[
                creator_id_feature.feature_name,
                language_id_feature.feature_name,
                engagement_feature.feature_name
            ]].groupby([
                creator_id_feature.feature_name,
                language_id_feature.feature_name
            ]).first())

            dictionary_creator_df.columns = ['users']

            dictionary_creator = dictionary_creator_df.to_dict()['users']

            # Languages known for each user when acting as engager
            dictionary_engager_df = pd.DataFrame(positive_dataframe[[
                engager_id_feature.feature_name,
                language_id_feature.feature_name,
                engagement_feature.feature_name
            ]].groupby([
                engager_id_feature.feature_name,
                language_id_feature.feature_name
            ]).first())

            dictionary_engager_df.columns = ['users']

            dictionary_engager = dictionary_engager_df.to_dict()['users']

            # Merge the two dictionaries
            dictionary_user = {**dictionary_creator, **dictionary_engager}

            # Look up each (engager, language) pair in the merged dictionary
            result_df = pd.DataFrame(dataframe[[
                engager_id_feature.feature_name,
                language_id_feature.feature_name
            ]].apply(lambda x: dictionary_user.get((x[0], x[1]), False),
                     axis=1))

            # Save back the dataframe
            self.save_feature(result_df)
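Because .first() on the boolean engagement column always yields True for the grouped pairs, the merged dictionary is effectively a set of (user, language) pairs seen in positive engagements. Under that reading, the same lookup can be written with an explicit set, which makes the intent clearer (a sketch, not the repository's code):

# Every (user, language) pair observed in a positive engagement,
# whether the user acted as creator or as engager.
known_pairs = (
    set(zip(positive_dataframe[creator_id_feature.feature_name],
            positive_dataframe[language_id_feature.feature_name])) |
    set(zip(positive_dataframe[engager_id_feature.feature_name],
            positive_dataframe[language_id_feature.feature_name])))

# Membership test replaces dictionary_user.get((engager, language), False).
result_df = pd.DataFrame(
    dataframe[[engager_id_feature.feature_name,
               language_id_feature.feature_name]]
    .apply(lambda x: (x[0], x[1]) in known_pairs, axis=1))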
Example #10
    def create_feature(self):

        # Check if the dataset id is train or test
        if is_test_or_val_set(self.dataset_id):
            train_dataset_id = get_train_set_id_from_test_or_val_set(
                self.dataset_id)
            test_dataset_id = self.dataset_id
        else:
            train_dataset_id = self.dataset_id
            test_dataset_id = get_test_or_val_set_id_from_train(
                train_dataset_id)

        # Load features
        creation_timestamps_feature = RawFeatureTweetTimestamp(
            train_dataset_id)
        creators_feature = MappedFeatureCreatorId(train_dataset_id)
        engagers_feature = MappedFeatureEngagerId(train_dataset_id)
        language_feature = MappedFeatureGroupedTweetLanguage(train_dataset_id)
        engagement_feature = TweetFeatureEngagementIsPositive(train_dataset_id)

        dataframe = pd.concat([
            creation_timestamps_feature.load_or_create(),
            creators_feature.load_or_create(),
            engagers_feature.load_or_create(),
            language_feature.load_or_create(),
            engagement_feature.load_or_create()
        ],
                              axis=1)

        dataframe.sort_values(creation_timestamps_feature.feature_name,
                              inplace=True)

        engager_counter_array = np.zeros(
            (data.DataStats.get_max_user_id() + 1, 70), dtype=np.uint16)

        result = pd.DataFrame([
            find_and_increase_engager(engager_id, creator_id, language,
                                      engagement, engager_counter_array)
            for engager_id, creator_id, language, engagement in zip(
                dataframe[engagers_feature.feature_name], dataframe[
                    creators_feature.feature_name], dataframe[
                        language_feature.feature_name], dataframe[
                            engagement_feature.feature_name])
        ],
                              index=dataframe.index)
        if not EngagerMainGroupedLanguage(train_dataset_id).has_feature():
            result.sort_index(inplace=True)
            EngagerMainGroupedLanguage(train_dataset_id).save_feature(result)
        if not EngagerMainGroupedLanguage(test_dataset_id).has_feature():
            # Load features
            creation_timestamps_feature = RawFeatureTweetTimestamp(
                test_dataset_id)
            creators_feature = MappedFeatureCreatorId(test_dataset_id)
            engagers_feature = MappedFeatureEngagerId(test_dataset_id)
            language_feature = MappedFeatureGroupedTweetLanguage(
                test_dataset_id)

            dataframe = pd.concat([
                creation_timestamps_feature.load_or_create(),
                creators_feature.load_or_create(),
                engagers_feature.load_or_create(),
                language_feature.load_or_create()
            ],
                                  axis=1)

            dataframe.sort_values(creation_timestamps_feature.feature_name,
                                  inplace=True)

            result = pd.DataFrame([
                find_and_increase_engager(engager_id, creator_id, language,
                                          False, engager_counter_array)
                for engager_id, creator_id, language in zip(
                    dataframe[engagers_feature.feature_name], dataframe[
                        creators_feature.feature_name], dataframe[
                            language_feature.feature_name])
            ],
                                  index=dataframe.index)

            result.sort_index(inplace=True)

            EngagerMainGroupedLanguage(test_dataset_id).save_feature(result)
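find_and_increase_engager in this last example takes a 2-D (user x 70 grouped languages) uint16 counter and an explicit engagement flag. Given the class name EngagerMainGroupedLanguage, it plausibly returns the engager's most frequent language so far and records positive engagements; a sketch under those assumptions (creator_id is passed through but unused in this guess):

import numpy as np

def find_and_increase_engager(engager_id, creator_id, language,
                              engagement, engager_counter_array):
    # Most frequent grouped language seen so far for this engager
    # (argmax over the 70 buckets; ties resolve to the lowest id).
    main_language = int(np.argmax(engager_counter_array[engager_id]))
    if engagement:
        # Record the positive engagement for the engager.
        engager_counter_array[engager_id, language] += 1
    return main_language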