def create_feature(self):
    """
    Create the feature for this dataset id.

    For a test/val dataset id: compute from the whole train set and apply to
    the test set. For a train dataset id: compute out-of-fold — the train set
    is shuffled and split into ``self.number_of_folds`` folds, and each fold's
    values are derived from the other folds only, so a row's own engagement
    never leaks into its feature.
    """
    # Resolve the paired train/test dataset ids from self.dataset_id.
    if is_test_or_val_set(self.dataset_id):
        train_dataset_id = get_train_set_id_from_test_or_val_set(
            self.dataset_id)
        test_dataset_id = self.dataset_id
    else:
        train_dataset_id = self.dataset_id
        test_dataset_id = get_test_or_val_set_id_from_train(
            train_dataset_id)

    import Utils.Data.Data as data
    train_df = data.get_dataset([
        "mapped_feature_creator_id",
        "mapped_feature_engager_id",
        f"tweet_feature_engagement_is_{self._get_suffix()}"
    ], train_dataset_id)

    if is_test_or_val_set(self.dataset_id):
        test_df = data.get_dataset(
            ["mapped_feature_creator_id", "mapped_feature_engager_id"],
            test_dataset_id)
        # Keep only the rows whose engagement of this kind actually happened.
        train_df = train_df[
            train_df[f"tweet_feature_engagement_is_{self._get_suffix()}"]
            == True]
        res = compute(train_df, test_df)
        res.sort_index(inplace=True)
        self._save_test_result(res, test_dataset_id)
    else:
        # Shuffle once, then split the train set into folds.
        X_train_folds = np.array_split(train_df.sample(frac=1),
                                       self.number_of_folds)
        result = None
        for i in range(self.number_of_folds):
            # BUG FIX: the fold filter used `x is not i`, which compares
            # object *identity* of ints (only reliable for CPython's cached
            # small ints) instead of value inequality; use `!=`.
            local_train = pd.concat([
                X_train_folds[x]
                for x in range(self.number_of_folds) if x != i
            ])
            local_train = local_train[local_train[
                f"tweet_feature_engagement_is_{self._get_suffix()}"] == True]
            local_test = X_train_folds[i]
            res = compute(local_train, local_test)
            # Accumulate the out-of-fold results across all folds.
            result = res if result is None else pd.concat([result, res])
        self._save_train_result_if_not_present(result, train_dataset_id)
def create_feature(self):
    """
    Build, for every row, the number of engagements the engager had made
    before that row's tweet (rows are scanned in tweet-timestamp order, so
    each row only sees the past). The counters accumulated on the train set
    are then reused, frozen, to label the paired test/val set.
    """
    # Work out which dataset ids play the train and test roles.
    if is_test_or_val_set(self.dataset_id):
        train_dataset_id = get_train_set_id_from_test_or_val_set(
            self.dataset_id)
        test_dataset_id = self.dataset_id
    else:
        train_dataset_id = self.dataset_id
        test_dataset_id = get_test_or_val_set_id_from_train(
            train_dataset_id)

    # Train-side inputs.
    timestamps = RawFeatureTweetTimestamp(train_dataset_id)
    engagers = MappedFeatureEngagerId(train_dataset_id)
    eng_col = engagers.feature_name

    df = pd.concat([
        timestamps.load_or_create(),
        engagers.load_or_create(),
    ], axis=1)
    # Chronological order: a counter must only reflect earlier engagements.
    df.sort_values(timestamps.feature_name, inplace=True)

    # One counter slot per engager id, incremented as rows are scanned.
    counters = np.zeros(df[engagers.feature_name].max() + 1, dtype=int)
    result = pd.DataFrame(
        [find_and_increase(engager_id, counters)
         for engager_id in df[eng_col]],
        index=df.index)

    train_feature = EngagerFeatureNumberOfPreviousEngagement(
        train_dataset_id)
    if not train_feature.has_feature():
        result.sort_index(inplace=True)
        train_feature.save_feature(result)

    test_feature = EngagerFeatureNumberOfPreviousEngagement(
        test_dataset_id)
    if not test_feature.has_feature():
        # Test-side inputs.
        timestamps = RawFeatureTweetTimestamp(test_dataset_id)
        engagers = MappedFeatureEngagerId(test_dataset_id)
        df = pd.concat([
            timestamps.load_or_create(),
            engagers.load_or_create(),
        ], axis=1)
        # Grow the counter array if the test set contains unseen engager ids
        # (new ids read as 0 previous engagements).
        needed = df[engagers.feature_name].max() + 1
        if needed > counters.size:
            counters = np.pad(counters,
                              pad_width=(0, needed - counters.size),
                              mode='constant',
                              constant_values=0)
        # Frozen lookup on the test side: no increments here.
        result = pd.DataFrame(df[eng_col].map(lambda x: counters[x]),
                              index=df.index)
        test_feature.save_feature(result)
def create_feature(self):
    """
    Compute, per creator and in chronological order, the running ratio of
    unique-token length to total-token length over the creator's past tweets.

    The per-creator running totals accumulated on the train set are carried
    over (and keep updating) while labelling the paired test/val set.
    """
    # Resolve the paired train/test dataset ids from self.dataset_id.
    if is_test_or_val_set(self.dataset_id):
        train_dataset_id = get_train_set_id_from_test_or_val_set(
            self.dataset_id)
        # BUG FIX: this branch previously re-derived the test id with
        # get_test_or_val_set_id_from_train(train_dataset_id), which can map
        # a *val* dataset id onto the *test* one. self.dataset_id already IS
        # the test/val id (this matches every sibling create_feature).
        test_dataset_id = self.dataset_id
    else:
        train_dataset_id = self.dataset_id
        test_dataset_id = get_test_or_val_set_id_from_train(
            train_dataset_id)

    # Load train-side features.
    creation_timestamps_feature = RawFeatureTweetTimestamp(
        train_dataset_id)
    creators_feature = MappedFeatureCreatorId(train_dataset_id)
    tweet_id_feature = MappedFeatureTweetId(train_dataset_id)
    # Save the column names.
    creators_col = creators_feature.feature_name
    tweet_id_col = tweet_id_feature.feature_name
    # Token-length lookup tables, indexed by tweet id.
    length_dict = TweetTokenLengthFeatureDictArray().load_or_create()
    length_unique_dict = TweetTokenLengthUniqueFeatureDictArray(
    ).load_or_create()

    dataframe = pd.concat([
        creators_feature.load_or_create(),
        creation_timestamps_feature.load_or_create(),
        tweet_id_feature.load_or_create()
    ], axis=1)
    # Chronological order so each row only sees earlier tweets.
    dataframe.sort_values(creation_timestamps_feature.feature_name,
                          inplace=True)

    # Running totals per creator id: total token length and unique token
    # length accumulated so far.
    creator_length_array = np.zeros(
        dataframe[creators_feature.feature_name].max() + 1, dtype=int)
    creator_length_unique_array = np.zeros(
        dataframe[creators_feature.feature_name].max() + 1, dtype=int)

    result = pd.DataFrame([
        find_ratio_and_update(
            creator_id, creator_length_array, creator_length_unique_array,
            length_dict[tweet_id], length_unique_dict[tweet_id])
        for creator_id, tweet_id in zip(dataframe[creators_col],
                                        dataframe[tweet_id_col])
    ], index=dataframe.index)

    if not CreatorFrequencyUniqueTokens(train_dataset_id).has_feature():
        result.sort_index(inplace=True)
        CreatorFrequencyUniqueTokens(train_dataset_id).save_feature(result)

    if not CreatorFrequencyUniqueTokens(test_dataset_id).has_feature():
        # Load test-side features.
        creation_timestamps_feature = RawFeatureTweetTimestamp(
            test_dataset_id)
        creators_feature = MappedFeatureCreatorId(test_dataset_id)
        tweet_id_feature = MappedFeatureTweetId(test_dataset_id)
        # Save the column names.
        creators_col = creators_feature.feature_name
        tweet_id_col = tweet_id_feature.feature_name

        dataframe = pd.concat([
            creation_timestamps_feature.load_or_create(),
            creators_feature.load_or_create(),
            tweet_id_feature.load_or_create(),
        ], axis=1)
        dataframe.sort_values(creation_timestamps_feature.feature_name,
                              inplace=True)

        # If there are new creators in the test set, pad the arrays.
        if dataframe[creators_col].max() + 1 > creator_length_array.size:
            creator_length_array = np.pad(
                creator_length_array,
                pad_width=(0, dataframe[creators_col].max() + 1 -
                           creator_length_array.size),
                mode='constant',
                constant_values=0)
            # BUG FIX: this pad previously took `creator_length_array` as its
            # source, silently replacing the unique-length totals with the
            # total-length totals. Pad `creator_length_unique_array` itself.
            creator_length_unique_array = np.pad(
                creator_length_unique_array,
                pad_width=(0, dataframe[creators_col].max() + 1 -
                           creator_length_unique_array.size),
                mode='constant',
                constant_values=0)

        result = pd.DataFrame([
            find_ratio_and_update(
                creator_id, creator_length_array,
                creator_length_unique_array, length_dict[tweet_id],
                length_unique_dict[tweet_id])
            for creator_id, tweet_id in zip(dataframe[creators_col],
                                            dataframe[tweet_id_col])
        ], index=dataframe.index)
        result.sort_index(inplace=True)
        CreatorFrequencyUniqueTokens(test_dataset_id).save_feature(result)
def create_feature(self):
    """
    Compute, per row and in chronological order, how many prior engagements
    of this feature's kind the tweet's creator has received. Train counters
    are carried over, frozen, to label the paired test/val set.
    """
    # Check if the dataset id is train or test and resolve the paired ids.
    if is_test_or_val_set(self.dataset_id):
        train_dataset_id = get_train_set_id_from_test_or_val_set(
            self.dataset_id)
        test_dataset_id = self.dataset_id
    else:
        train_dataset_id = self.dataset_id
        test_dataset_id = get_test_or_val_set_id_from_train(
            train_dataset_id)
    # Wall-clock timing for the log printed at the end.
    start_time = time.time()
    # Load train-side features.
    creation_timestamps_feature = RawFeatureTweetTimestamp(
        train_dataset_id)
    creator_id_feature = MappedFeatureCreatorId(train_dataset_id)
    # The concrete engagement kind (like/retweet/...) is supplied by the
    # subclass via _get_engagement_feature.
    engagement_feature = self._get_engagement_feature(train_dataset_id)
    # Save column names.
    creator_id_col = creator_id_feature.feature_name
    engagement_col = engagement_feature.feature_name
    dataframe = pd.concat([
        creation_timestamps_feature.load_or_create(),
        creator_id_feature.load_or_create(),
        engagement_feature.load_or_create()
    ], axis=1)
    # Chronological order: each row must only see earlier engagements.
    dataframe.sort_values(creation_timestamps_feature.feature_name,
                          inplace=True)
    # One counter slot per creator id seen in the train set.
    creator_counter_array = np.zeros(dataframe[creator_id_col].max() + 1,
                                     dtype=int)
    # Engaged rows: return the current count and increment; non-engaged
    # rows: read the current count without incrementing.
    result = pd.DataFrame([
        find_and_increase(creator_id=creator_id,
                          counter_array=creator_counter_array)
        if engagement else creator_counter_array[creator_id]
        for creator_id, engagement in zip(dataframe[creator_id_col],
                                          dataframe[engagement_col])
    ], index=dataframe.index)
    self._save_train_result_if_not_present(result, train_dataset_id)
    if not self._exists_test_feature(test_dataset_id):
        # Load test-side features (no engagement column: counters frozen).
        creation_timestamps_feature = RawFeatureTweetTimestamp(
            test_dataset_id)
        creator_id_feature = MappedFeatureCreatorId(test_dataset_id)
        # Save column names.
        creator_id_col = creator_id_feature.feature_name
        dataframe = pd.concat([
            creator_id_feature.load_or_create(),
            creation_timestamps_feature.load_or_create(),
        ], axis=1)
        dataframe.sort_values(creation_timestamps_feature.feature_name,
                              inplace=True)
        # Grow the counter array if the test set has unseen creator ids
        # (new ids read as 0 previous engagements).
        if dataframe[creator_id_col].max(
        ) + 1 > creator_counter_array.size:
            creator_counter_array = np.pad(
                creator_counter_array,
                pad_width=(0, dataframe[creator_id_col].max() + 1 -
                           creator_counter_array.size),
                mode='constant',
                constant_values=0)
        # Plain lookup on the test side: no increments.
        result = pd.DataFrame(dataframe[creator_id_col].map(
            lambda x: creator_counter_array[x]),
                              index=dataframe.index)
        result.sort_index(inplace=True)
        print("time:")
        print(time.time() - start_time)
        self._save_test_result(result, test_dataset_id)
def analyze_cold_creator(dataset_id: str):
    """
    Print statistics about "cold" creators: users that appear as creators in
    the test set but never appear in the train set (as creator or engager of
    a positively-engaged row).

    :param dataset_id: id of the TRAIN dataset; the paired test/val id is
        derived from it.
    """
    train_dataset = dataset_id
    test_dataset = get_test_or_val_set_id_from_train(dataset_id)

    # Boolean mask of the train rows with a positive engagement.
    is_positive = TweetFeatureEngagementIsPositive(train_dataset)
    is_positive_df = is_positive.load_or_create()
    positive_mask = is_positive_df[is_positive.feature_name]

    train_creator_id = MappedFeatureCreatorId(train_dataset)
    train_engager_id = MappedFeatureEngagerId(train_dataset)
    train_creator_id_df = train_creator_id.load_or_create()[positive_mask]
    train_engager_id_df = train_engager_id.load_or_create()[positive_mask]

    # All user ids seen in train on either side of a positive engagement.
    train_users_id_df = pd.DataFrame(
        pd.concat([
            train_creator_id_df[train_creator_id.feature_name],
            train_engager_id_df[train_engager_id.feature_name]
        ], axis=0).unique())
    train_users_id_df['is_train'] = True
    train_users_id_df.set_index(train_users_id_df.columns[0], inplace=True)

    test_creator_id = MappedFeatureCreatorId(test_dataset)
    test_creator_id_df = test_creator_id.load_or_create()
    total_number_of_engagements = len(test_creator_id_df)

    # Unique creator ids in the test set.
    test_users_id_df = pd.DataFrame(
        pd.concat([
            test_creator_id_df[test_creator_id.feature_name],
        ], axis=0).unique())
    # BUG FIX: the per-creator engagement count was previously grouped over
    # the already-deduplicated test_users_id_df, so every count was exactly 1
    # (making the sum and max below meaningless). Group the raw test creator
    # column instead, giving the true engagement count per creator id.
    count = pd.DataFrame({
        'count':
        test_creator_id_df.groupby(test_creator_id.feature_name).size()
    })
    test_users_id_df['is_test'] = True
    test_users_id_df.set_index(test_users_id_df.columns[0], inplace=True)

    # Outer join of the two user sets; a cold creator is "in test, not in
    # train" after filling the missing flags with False.
    x = train_users_id_df.join(test_users_id_df, how='outer')
    x = x.fillna(False)
    train_mask = x['is_train'] == False
    test_mask = x['is_test'] == True
    mask = train_mask & test_mask
    cold_users = np.array(x[mask].index.array)

    print(f"------------------------")
    print(dataset_id)
    print(f"Unique test creators are: {len(test_users_id_df)}")
    print(f"Unique test cold creators are: {len(cold_users)}")
    print(f"Engagements in test set are: {total_number_of_engagements}")
    print(
        f"Engagements of cold users in test set are: {count['count'][mask].sum()}"
    )
    # BUG FIX: label typo "Man number" -> "Max number".
    print(
        f"Max number of engagement per cold creator in test set are: {count['count'][mask].max()}"
    )
    # NOTE(review): this ratio divides the number of unique cold creators by
    # the total number of test engagements; if the intent is "probability a
    # random engagement has a cold creator", the numerator should be
    # count['count'][mask].sum() — confirm before relying on this figure.
    print(
        f"Probability that an engagement is engaged by a cold creators: {len(cold_users)/total_number_of_engagements}"
    )
    print(f"------------------------")
def create_feature(self):
    """
    Compute, per row and in chronological order, a count of the engager's
    previous negative engagements associated with the tweet's language, then
    apply the accumulated state to the paired test/val set.
    """
    # Check if the dataset id is train or test and resolve the paired ids.
    if is_test_or_val_set(self.dataset_id):
        train_dataset_id = get_train_set_id_from_test_or_val_set(
            self.dataset_id)
        test_dataset_id = self.dataset_id
    else:
        train_dataset_id = self.dataset_id
        test_dataset_id = get_test_or_val_set_id_from_train(
            train_dataset_id)
    # Load train-side features.
    creation_timestamps_feature = RawFeatureTweetTimestamp(
        train_dataset_id)
    engagers_feature = MappedFeatureEngagerId(train_dataset_id)
    creators_feature = MappedFeatureCreatorId(train_dataset_id)
    language_feature = MappedFeatureTweetLanguage(train_dataset_id)
    engagement_feature = TweetFeatureEngagementIsNegative(train_dataset_id)
    dataframe = pd.concat([
        creation_timestamps_feature.load_or_create(),
        engagers_feature.load_or_create(),
        engagement_feature.load_or_create(),
        creators_feature.load_or_create(),
        language_feature.load_or_create()
    ], axis=1)
    # Chronological order: each row must only see earlier engagements.
    dataframe.sort_values(creation_timestamps_feature.feature_name,
                          inplace=True)
    # Running state shared between the train and test passes.
    # NOTE(review): the original comment claimed the key is a
    # (creator, engager) tuple, but the read-only fallback below looks up
    # (eng_id, lang) — the key scheme lives inside find_and_increase_engager
    # and cannot be confirmed from here; verify against that helper.
    engagement_dict = {}
    # Negative rows update the dict; other rows only read the current value.
    result = pd.DataFrame([
        find_and_increase_engager(eng_id, cre_id, lang, engagement_dict)
        if engagement else engagement_dict.get((eng_id, lang), 0)
        for eng_id, cre_id, lang, engagement in zip(
            dataframe[engagers_feature.feature_name], dataframe[
                creators_feature.feature_name], dataframe[
                    language_feature.feature_name], dataframe[
                        engagement_feature.feature_name])
    ], index=dataframe.index)
    if not EngagerFeatureNumberOfPreviousNegativeEngagementWithLanguage(
            train_dataset_id).has_feature():
        result.sort_index(inplace=True)
        EngagerFeatureNumberOfPreviousNegativeEngagementWithLanguage(
            train_dataset_id).save_feature(result)
    if not EngagerFeatureNumberOfPreviousNegativeEngagementWithLanguage(
            test_dataset_id).has_feature():
        # Load test-side features (no engagement column on this side).
        creation_timestamps_feature = RawFeatureTweetTimestamp(
            test_dataset_id)
        engagers_feature = MappedFeatureEngagerId(test_dataset_id)
        language_feature = MappedFeatureTweetLanguage(test_dataset_id)
        creators_feature = MappedFeatureCreatorId(test_dataset_id)
        dataframe = pd.concat([
            creation_timestamps_feature.load_or_create(),
            engagers_feature.load_or_create(),
            creators_feature.load_or_create(),
            language_feature.load_or_create()
        ], axis=1)
        dataframe.sort_values(creation_timestamps_feature.feature_name,
                              inplace=True)
        # NOTE(review): the test pass calls find_and_increase_creator while
        # the train pass calls find_and_increase_engager — confirm this
        # asymmetry is intentional and not a copy/paste slip.
        result = pd.DataFrame([
            find_and_increase_creator(eng_id, cre_id, lang, engagement_dict)
            for eng_id, cre_id, lang in zip(
                dataframe[engagers_feature.feature_name], dataframe[
                    creators_feature.feature_name], dataframe[
                        language_feature.feature_name])
        ], index=dataframe.index)
        result.sort_index(inplace=True)
        EngagerFeatureNumberOfPreviousNegativeEngagementWithLanguage(
            test_dataset_id).save_feature(result)
def create_feature(self):
    """
    Track, per engager, counts of positive engagements broken down by the
    tweet's grouped language, scanning rows in timestamp order. The counters
    built on the train set are reused on the paired test/val set with the
    engagement flag forced to False (read-only there).
    """
    # Work out which dataset ids play the train and test roles.
    if is_test_or_val_set(self.dataset_id):
        train_dataset_id = get_train_set_id_from_test_or_val_set(
            self.dataset_id)
        test_dataset_id = self.dataset_id
    else:
        train_dataset_id = self.dataset_id
        test_dataset_id = get_test_or_val_set_id_from_train(
            train_dataset_id)

    # Train-side inputs.
    timestamps = RawFeatureTweetTimestamp(train_dataset_id)
    creators = MappedFeatureCreatorId(train_dataset_id)
    engagers = MappedFeatureEngagerId(train_dataset_id)
    languages = MappedFeatureGroupedTweetLanguage(train_dataset_id)
    engagements = TweetFeatureEngagementIsPositive(train_dataset_id)

    df = pd.concat([
        timestamps.load_or_create(),
        creators.load_or_create(),
        engagers.load_or_create(),
        languages.load_or_create(),
        engagements.load_or_create()
    ], axis=1)
    # Chronological scan: counters must only reflect the past.
    df.sort_values(timestamps.feature_name, inplace=True)

    # One row per user id, one column per grouped language
    # (70 groups — assumed fixed by the upstream language grouping).
    counters = np.zeros((data.DataStats.get_max_user_id() + 1, 70),
                        dtype=np.uint16)

    train_rows = zip(df[engagers.feature_name], df[creators.feature_name],
                     df[languages.feature_name],
                     df[engagements.feature_name])
    result = pd.DataFrame([
        find_and_increase_engager(engager_id, creator_id, language,
                                  engagement, counters)
        for engager_id, creator_id, language, engagement in train_rows
    ], index=df.index)

    if not EngagerMainGroupedLanguage(train_dataset_id).has_feature():
        result.sort_index(inplace=True)
        EngagerMainGroupedLanguage(train_dataset_id).save_feature(result)

    if not EngagerMainGroupedLanguage(test_dataset_id).has_feature():
        # Test-side inputs (no engagement column on this side).
        timestamps = RawFeatureTweetTimestamp(test_dataset_id)
        creators = MappedFeatureCreatorId(test_dataset_id)
        engagers = MappedFeatureEngagerId(test_dataset_id)
        languages = MappedFeatureGroupedTweetLanguage(test_dataset_id)

        df = pd.concat([
            timestamps.load_or_create(),
            creators.load_or_create(),
            engagers.load_or_create(),
            languages.load_or_create()
        ], axis=1)
        df.sort_values(timestamps.feature_name, inplace=True)

        # engagement=False keeps the counters frozen during the test pass.
        test_rows = zip(df[engagers.feature_name],
                        df[creators.feature_name],
                        df[languages.feature_name])
        result = pd.DataFrame([
            find_and_increase_engager(engager_id, creator_id, language,
                                      False, counters)
            for engager_id, creator_id, language in test_rows
        ], index=df.index)
        result.sort_index(inplace=True)
        EngagerMainGroupedLanguage(test_dataset_id).save_feature(result)