def __training(self, train_seasons: Set[int]) -> Tuple[gaussian_kde, gaussian_kde]:
    """ Execute the training phase of the Age Layer.

    Arguments:
        train_seasons (Set[int]): The seasons used as training data.

    Returns:
        A tuple (non_mol_kde, mol_kde): first the kernel density estimator fitted on the ages of the
        non-Mol players, then the kernel density estimator fitted on the ages of the Mol players.
    """
    # Only players that participated in one of the training seasons are used.
    players = [player for player in Player if get_season(player) in train_seasons]

    # Split the ages directly by Mol status; any falsy result of get_is_mol counts as non-Mol.
    non_mol_ages = np.array([float(get_age(player)) for player in players if not get_is_mol(player)])
    mol_ages = np.array([float(get_age(player)) for player in players if get_is_mol(player)])

    non_mol_kde = InnerAppearanceLayer.kernel_density_estimation(non_mol_ages)
    mol_kde = InnerAppearanceLayer.kernel_density_estimation(mol_ages)
    # NOTE: the non-Mol estimator comes first; callers unpack the tuple in this order.
    return non_mol_kde, mol_kde
def seasons_with_data() -> Set[int]:
    """ Get all seasons that have Wikipedia data.

    Returns:
        The set of seasons obtained by applying get_season to every player found when iterating LINKER.
    """
    return set(map(get_season, LINKER))
# Count all words for the selected seasons and print how often every sub word occurs.
from collections import Counter

from nltk.corpus import stopwords

from Data.Player import Player
from Data.PlayerData import get_season
from Layers.Wikipedia.WikipediaParser import WikipediaParser

# Seasons 9 up to and including 20.
SEASONS = set(range(9, 21))

# Step 1: merge the word counts parsed from the Wikipedia file of every player in SEASONS.
pure_counter = Counter()
for player in Player:
    if get_season(player) not in SEASONS:
        continue
    pure_counter += WikipediaParser.wiki_file_parse(player)

# Step 2: credit the count of every word to each of its sub words, leaving out Dutch stop words.
dictionary = WikipediaParser.get_standard_dictionary()
stop_words = set(stopwords.words('dutch'))
compound_counter = Counter()
for word, count in pure_counter.items():
    sub_words = WikipediaParser.get_all_sub_words(
        word, dictionary, WikipediaParser.MIN_LENGTH_COMPOUND_WORD)
    sub_words.difference_update(stop_words)
    # Counter.update adds the given count on top of the existing count of every key.
    compound_counter.update(dict.fromkeys(sub_words, count))

print(compound_counter)
mol_points: int) -> float: u = (non_mol_kde.pdf(x)[0] - mol_kde.pdf(x)[0])**4 r = 1 / (2 * math.sqrt(math.pi)) non_mol_term = non_mol_kde.pdf(x)[0] / (non_mol_points * math.sqrt(non_mol_bandwidth)) mol_term = mol_kde.pdf(x)[0] / (mol_points * math.sqrt(mol_bandwidth)) sigma = (r * (non_mol_term + mol_term))**2 return u / sigma TEST_SEASONS = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 } ALPHA = 0.05 players = [player for player in Player if get_season(player) in TEST_SEASONS] train_input = [float(get_age(player)) for player in players] train_output = [1.0 if get_is_mol(player) else 0.0 for player in players] non_mol = np.array( [data for data, label in zip(train_input, train_output) if label == 0.0]) mol = np.array( [data for data, label in zip(train_input, train_output) if label == 1.0]) non_mol_kde = InnerAppearanceLayer.kernel_density_estimation(non_mol) non_mol_bandwidth = silverman_bandwidth(np.array(non_mol)) non_mol_points = len(non_mol) mol_kde = InnerAppearanceLayer.kernel_density_estimation(mol) mol_bandwidth = silverman_bandwidth(np.array(mol)) mol_points = len(mol) ages = [float(age) for age in range(20, 59)]