def fit(self):
    """
    Processes the creation of the contents and serializes the contents.

    This method starts the content production process and initializes everything that will be
    used to create said contents, their fields and their representations.
    """
    # Before starting the process, the content analyzer main checks that there are no
    # duplicate id cases, both in the field dictionary and in the exogenous representation
    # list. This is done now and not recursively for each content during the creation
    # process, in order to avoid starting an operation that is going to fail.
    # NOTE: both checks raise ValueError on failure; the original wrapped them in a
    # try/except that immediately re-raised the same exception, which was a no-op
    # and has been removed.
    self.__check_field_dict()
    self.__check_exogenous_representation_list()

    # Creates the directory where the data will be serialized, overwriting it if it
    # already exists
    output_path = self.__config.output_directory
    if os.path.exists(output_path):
        shutil.rmtree(output_path)
    os.mkdir(output_path)

    contents_producer = ContentsProducer.get_instance()
    contents_producer.set_config(self.__config)
    created_contents = contents_producer.create_contents()

    # Optionally export a human-readable JSON dump of every created content
    if self.__config.export_json:
        json_path = os.path.join(self.__config.output_directory, 'contents.json')
        with open(json_path, "w") as data:
            json.dump(created_contents, data, cls=ContentEncoder, indent=4)

    for content in progbar(created_contents, prefix="Serializing contents: "):
        self.__serialize_content(content)
def get_rated_items(items_directory, ratings) -> List[Content]:
    """
    Gets the items that a user has rated.

    NOTE: the original docstring claimed this returns items the user has NOT rated;
    the implementation (and the return doc below) load the RATED ones.

    Args:
        items_directory (str): Path to the items directory
        ratings (pd.DataFrame): Ratings of the user

    Returns:
        rated_items (List[Content]): List of items that the user has rated
    """
    # All serialized item ids available on disk ('search_index' is auxiliary, not an item)
    directory_filename_list = [os.path.splitext(filename)[0]
                               for filename in os.listdir(items_directory)
                               if filename != 'search_index']

    # Ids of the items the user rated; punctuation is stripped so the ids match
    # the sanitized on-disk filenames
    rated_items_filename_list = {re.sub(r'[^\w\s]', '', item_id)
                                 for item_id in ratings.to_id}

    # Keep only the rated ids that actually exist on disk.
    # The original additionally re-intersected this list against
    # directory_filename_list, a redundant O(n^2) no-op that has been removed.
    filename_list = [item_id for item_id in directory_filename_list
                     if item_id in rated_items_filename_list]
    filename_list.sort()

    rated_items = [load_content_instance(items_directory, item_id)
                   for item_id in progbar(filename_list, prefix="Loading rated items:")]

    return rated_items
def import_ratings(self) -> pd.DataFrame:
    """
    Imports the ratings from the source and stores them in a DataFrame.

    Returns:
        ratings_frame: pd.DataFrame
    """
    columns = {'from_id': [], 'to_id': [], 'score': [], 'timestamp': []}

    for raw_row in progbar(list(self.__source), prefix="Importing ratings:"):
        columns['from_id'].append(self._get_field_data(self.from_id_column, raw_row))
        columns['to_id'].append(self._get_field_data(self.to_id_column, raw_row))
        if self.timestamp_column:
            columns['timestamp'].append(self._get_field_data(self.timestamp_column, raw_row))
        columns['score'].append(self._get_field_data(self.score_column, raw_row))

    # Drop the timestamp column entirely when no timestamp was collected
    if not columns['timestamp']:
        del columns['timestamp']

    # Scores are either transformed by the configured processor or coerced to float
    if self.score_processor:
        columns['score'] = self.score_processor.fit(columns['score'])
    else:
        columns['score'] = list(map(float, columns['score']))

    self.rating_frame = pd.DataFrame(columns)
    return self.rating_frame
def populate_from_dataframe(self, source_frame: pd.DataFrame):
    """
    Populate the graph using a DataFrame. It must have a 'from_id', 'to_id' and 'score' column.

    We iterate every row, and create a weighted link for every user and item in the rating
    frame based on the score the user gave the item, creating the nodes if they don't exist.

    Args:
        source_frame (pd.DataFrame): the rating frame from where the graph will be populated

    Raises:
        ValueError: when the frame is missing one of the mandatory columns
    """
    if not self._check_columns(source_frame):
        raise ValueError('The source frame must contains at least \'from_id\', \'to_id\', \'score\' columns')

    # Loop-invariant: the presence of a custom 'label' column is the same for every row,
    # so it is checked once instead of inside the loop
    has_label = 'label' in source_frame.columns

    # len(source_frame) replaces the original direct dunder call source_frame.__len__()
    for idx, row in progbar(source_frame.iterrows(), max_value=len(source_frame),
                            prefix="Populating Graph:"):
        self.add_user_node(row['from_id'])
        self.add_item_node(row['to_id'])

        label = row['label'] if has_label else self.get_default_score_label()
        self.add_link(row['from_id'], row['to_id'], row['score'], label=label)
def populate_from_dataframe(self, source_frame: pd.DataFrame):
    """
    Populate the graph using a DataFrame. It must have a 'from_id', 'to_id' and 'score' column.

    We iterate every row, and create a weighted link for every user and item in the rating
    frame based on the score the user gave the item, creating the nodes if they don't exist.
    We also add properties to 'item' nodes if the item_contents_dir is specified, and add
    properties to 'user' nodes if the user_contents_dir is specified.

    Args:
        source_frame (pd.DataFrame): the rating frame from where the graph will be populated

    Raises:
        ValueError: when the frame is missing one of the mandatory columns
    """
    if not self._check_columns(source_frame):
        raise ValueError('The source frame must contains at least \'from_id\', \'to_id\', \'score\' columns')

    # Loop-invariant: whether item/user properties must be loaded does not change per row,
    # so both checks are hoisted out of the loop
    load_item_props = self.get_item_contents_dir() is not None
    load_user_props = self.get_user_contents_dir() is not None

    # len(source_frame) replaces the original direct dunder call source_frame.__len__()
    for idx, row in progbar(source_frame.iterrows(), max_value=len(source_frame),
                            prefix="Populating Graph:"):
        self.add_user_node(row['from_id'])
        self.add_item_node(row['to_id'])
        # Scores are normalized before being stored on the link
        self.add_link(row['from_id'], row['to_id'],
                      self.normalize_score(row['score']),
                      label=self.get_default_score_label())
        if load_item_props:
            self._add_item_properties(row)
        if load_user_props:
            self._add_usr_properties(row)
def eval_metrics(
        self, metric_list: List[Metric]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Method which effectively evaluates recommendations generated with the list of metric passed as
    argument.

    It returns two Pandas DataFrame, the first one containing system results on all
    metrics specified, the second one containing each users results for every metric eligible

    Args:
        metric_list (List[Metric]): List of metric on which recommendations need to be evaluated

    Returns:
        Two pandas DataFrame, the first will contain the system result for every metric specified
        inside the metric list, the second one will contain every user results for every metric
        eligible
    """
    frames_to_concat = []

    eval_logger.info('Performing metrics chosen')

    for metric in progbar(metric_list, prefix='Performing {}:',
                          substitute_with_current=True):

        metric_result_list = []

        # Some metrics carry their own pred/truth split list; otherwise the splits
        # computed for this evaluation run are used
        if self._split_list is None:
            split_list = metric._get_pred_truth_list()
        else:
            split_list = self._split_list

        for split in split_list:

            # Splits with no predictions or no ground truth are skipped entirely
            if not split.pred.empty and not split.truth.empty:
                from_id_valid = split.pred['from_id']

                # Remove from truth item of which we do not have predictions
                # NOTE(review): this rebinds split.truth on the Split object itself,
                # so the caller's split is altered as a side effect
                split.truth = split.truth.query(
                    'from_id in @from_id_valid')

                metric_result = metric.perform(split)

                metric_result_list.append(metric_result)

        # Average this metric's per-split results for every user ('sys' included)
        total_results_metric = pd.concat(metric_result_list)

        if not total_results_metric.empty:
            total_results_metric = total_results_metric.groupby(
                'from_id').mean()

            total_results_metric.index.name = 'from_id'

            frames_to_concat.append(total_results_metric)

    # One column per metric, indexed by from_id
    final_result = pd.concat(frames_to_concat, axis=1)

    # The 'sys' row holds system-wide results; every other row is a single user
    system_results = final_result.loc[['sys']]

    each_user_result = final_result.drop(['sys'])

    # Drop metric columns for which no user has a value
    each_user_result = each_user_result.dropna(axis=1, how='all')

    return system_results, each_user_result
def eval_fit_recsys(cls, recsys: RecSys, split_list: List[Split], test_items_list: List[pd.DataFrame]):
    """
    Method which is usually called by the 'PredictionCalculator' module that generates
    recommendation lists. For every user, items that will be ranked are specified by the
    'test_items_list' parameter.

    Rankings generated will be stored into a class attribute (rank_truth_list), which is a list
    that contains Split objects: every object has two DataFrames, the first one has
    recommendation lists for every user, the second one has the 'ground truth' for every user.

    If the class attribute is non-empty, then the 'AlreadyFittedRecSys' exception is raised, so
    remember to clean the class attribute by calling the private method
    '_clean_pred_truth_list(...)' upon every new evaluation

    Args:
        recsys (RecSys): Recommender system which will generate predictions that will later be
            evaluated
        split_list (List[Split]): List of Split objects where every Split contains two
            DataFrames, the first has the 'train set' for every user, the second has the
            'test set' for every user
        test_items_list (List[pd.DataFrame]): List of DataFrames, one for every Split object
            inside the split_list parameter, where every DataFrame contains for every user the
            list of items that must be ranked

    Raises:
        AlreadyFittedRecSys exception when the class attribute 'rank_truth_list' is non-empty,
        meaning that recommendation lists are already been calculated
    """
    # Refuse to overwrite results of a previous evaluation; the caller must clean
    # the class attribute explicitly first
    if len(cls.rank_truth_list) != 0:
        raise AlreadyFittedRecSys

    # counter (1-based) only feeds the progress-bar prefix below
    for counter, (split, test_items_frame) in enumerate(zip(
            split_list, test_items_list), start=1):
        train = split.train
        test = split.test

        rank_truth = Split()
        rank_truth.truth = test

        frame_to_concat = []

        # Rankings are computed only for users present in the train set
        user_list_to_fit = set(train.from_id)

        # The literal '{}' left in the prefix is later substituted with the current
        # user by progbar (substitute_with_current=True)
        for user in progbar(
                user_list_to_fit,
                prefix='Calculating rank for user {} - split {}'.format(
                    '{}', counter),
                substitute_with_current=True):

            user_ratings_train = train.loc[train['from_id'] == user]

            # '@user' lets the pandas query reference the local 'user' variable
            test_items = list(
                test_items_frame.query('from_id == @user').to_id)

            result = recsys._eval_fit_rank(user_ratings_train, test_items)

            frame_to_concat.append(result)

        # All per-user rankings of this split collapsed into one prediction frame
        rank_truth.pred = pd.concat(frame_to_concat)

        cls.rank_truth_list.append(rank_truth)
def get_properties(self, raw_source: RawInformationSource) -> List[EntitiesProp]:
    """
    Produces a list of EntitiesProp objects for every raw content in the raw source.

    An EntitiesProp object is basically a dict where the keys are the entities linked (since
    there can be multiple entities in a field) and values are properties retrieved from BabelPy
    for that entity.

    EXAMPLE:
        properties_list = [EntityProp(), EntityProp(), ...]

        EntityProp.value -> {'DiCaprio': {'babelSynsetID': ..., ...},
                             'Nolan': {'babelSynsetID': ..., ...}, ...}

    Raises:
        AttributeError: when the BabelFy API limit has been reached
    """
    # Properties kept from each BabelPy entity; anything missing defaults to ''
    wanted_properties = ('babelSynsetID', 'DBPediaURL', 'BabelNetURL', 'score',
                         'coherenceScore', 'globalScore', 'source')

    properties_list = []
    logger.info("Doing Entity Linking with BabelFy")

    # NOTE(review): len(list(raw_source)) fully iterates the source once just to size the
    # progress bar — assumes the source is re-iterable; verify for generator-backed sources
    for raw_content in progbar(raw_source, max_value=len(list(raw_source))):
        data_to_disambiguate = check_not_tokenized(raw_content[self.__field_to_link])

        self.__babel_client.babelfy(data_to_disambiguate)

        properties_content = {}
        try:
            if self.__babel_client.merged_entities is not None:
                for entity in self.__babel_client.merged_entities:
                    # Keep only the wanted properties, defaulting to '' when BabelPy
                    # did not return a value for them
                    properties_content[entity['text']] = {
                        key: entity[key] if entity.get(key) is not None else ''
                        for key in wanted_properties
                    }

            properties_list.append(EntitiesProp(properties_content))
        except AttributeError:
            raise AttributeError(
                "BabelFy limit reached! Insert an api key or change it if you inserted one!"
            )

    return properties_list
def add_score_column(self, score_column: Union[str, int], column_name: str,
                     score_processor: RatingProcessor = None):
    """
    Extracts an additional score column from the source and appends it to the rating frame.

    Args:
        score_column (Union[str, int]): field name or positional index of the column to extract
        score_processor (RatingProcessor): optional processor applied to the raw values; when
            absent, values are simply converted to float
        column_name (str): name the new column will have in the rating frame

    Returns:
        The rating frame with the new column added
    """
    raw_values = [self._get_field_data(score_column, row)
                  for row in progbar(list(self.__source),
                                     prefix="Adding column {}:".format(column_name))]

    if score_processor:
        processed_values = score_processor.fit(raw_values)
    else:
        processed_values = [float(value) for value in raw_values]

    self.rating_frame[column_name] = processed_values
    return self.rating_frame
def populate_from_dataframe(self, source_frame: pd.DataFrame):
    """
    Populate the graph using a DataFrame. It must have a 'from_id', 'to_id' and 'score' column.

    The method will iterate for every row, and create a weighted link for every user and item
    in the rating frame based on the score the user gave the item, creating the nodes if they
    don't exist. We also add properties to 'item' nodes if the item_contents_dir is specified,
    and add properties to 'user' nodes if the user_contents_dir is specified.

    Args:
        source_frame (pd.DataFrame): the rating frame from where the graph will be populated

    Raises:
        ValueError: when the frame is missing one of the mandatory columns
    """
    if not self._check_columns(source_frame):
        raise ValueError(
            'The source frame must contains at least \'from_id\', \'to_id\', \'score\' columns'
        )

    # Loop-invariant checks hoisted out of the per-row loop; len() replaces the
    # original direct dunder call source_frame.__len__()
    has_label = 'label' in source_frame.columns
    load_user_props = self.get_user_contents_dir() is not None
    load_item_props = self.get_item_contents_dir() is not None

    for row in progbar(source_frame.to_dict('records'),
                       max_value=len(source_frame),
                       prefix="Populating Graph:"):
        # If the node already exists then we don't add it and, more importantly, we don't
        # retrieve its exo prop if specified, since they have already been retrieved
        # previously.
        if not self.node_exists(UserNode(row['from_id'])):
            self.add_user_node(row['from_id'])
            if load_user_props:
                self._add_usr_properties(row)

        # Same reasoning as above, applied to the item node.
        if not self.node_exists(ItemNode(row['to_id'])):
            self.add_item_node(row['to_id'])
            if load_item_props:
                self._add_item_properties(row)

        label = row['label'] if has_label else self.get_default_score_label()
        self.add_link(UserNode(row['from_id']), ItemNode(row['to_id']),
                      row['score'], label=label)
def eval_metrics(
        self, metric_list: List[Metric]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Evaluates the generated recommendations with every metric in 'metric_list'.

    Args:
        metric_list (List[Metric]): List of metric on which recommendations need to be
            evaluated

    Returns:
        Two pandas DataFrame: the first contains the system result for every metric specified
        inside the metric list, the second contains every user's results for every metric
        eligible
    """
    frames_to_concat = []

    eval_logger.info('Performing metrics chosen')

    for metric in progbar(metric_list, prefix='Performing {}:',
                          substitute_with_current=True):

        metric_result_list = []

        # Some metrics carry their own pred/truth split list; otherwise the splits
        # computed for this evaluation run are used
        if self._split_list is None:
            split_list = metric._get_pred_truth_list()
        else:
            split_list = self._split_list

        for split in split_list:

            # Splits with no predictions or no ground truth are skipped entirely
            if not split.pred.empty and not split.truth.empty:
                from_id_valid = split.pred['from_id']

                # Remove from truth item of which we do not have predictions
                # NOTE(review): this rebinds split.truth on the Split object itself,
                # so the caller's split is altered as a side effect
                split.truth = split.truth.query(
                    'from_id in @from_id_valid')

                metric_result = metric.perform(split)

                metric_result_list.append(metric_result)

        # Average this metric's per-split results for every user ('sys' included)
        total_results_metric = pd.concat(metric_result_list)

        if not total_results_metric.empty:
            total_results_metric = total_results_metric.groupby(
                'from_id').mean()

            total_results_metric.index.name = 'from_id'

            frames_to_concat.append(total_results_metric)

    # One column per metric, indexed by from_id
    final_result = pd.concat(frames_to_concat, axis=1)

    # The 'sys' row holds system-wide results; every other row is a single user
    system_results = final_result.loc[['sys']]

    each_user_result = final_result.drop(['sys'])

    # Drop metric columns for which no user has a value
    each_user_result = each_user_result.dropna(axis=1, how='all')

    return system_results, each_user_result
def split_all(self, ratings: pd.DataFrame, user_id_list: Set[str]):
    """
    Method that effectively splits the 'ratings' parameter into 'train set' and 'test set'.

    It must be specified a 'user_id_list' parameter so that the method will do the splitting
    only for the users specified inside the list.

    Args:
        ratings (pd.DataFrame): The DataFrame which contains the interactions of the users
            that must be splitted into 'train set' and 'test set'
        user_id_list (Set[str]): The set of users for which splitting will be done

    Returns:
        List of Split objects, one per split produced by the partition technique, each holding
        the concatenated train and test frames of every user that could be split
    """
    split_list = []

    eval_logger.info("Performing {} on ratings of every user".format(
        str(self._partition_technique)))

    for user_id in progbar(user_id_list, prefix="Current user - {}:",
                           substitute_with_current=True):
        user_ratings = ratings[ratings['from_id'] == user_id]
        try:
            user_splits_list = self._split_single(user_ratings)
        except PartitionError as e:
            # A user whose ratings can't be partitioned (e.g. too few interactions)
            # is skipped instead of aborting the whole operation
            eval_logger.warning(
                str(e) + "\nThe user {} will be skipped".format(user_id))
            continue

        if split_list:
            # Merge this user's splits into the accumulated ones, split by split
            for user_split, total_split in zip(user_splits_list, split_list):
                total_split.train = pd.concat(
                    [total_split.train, user_split.train])
                total_split.test = pd.concat(
                    [total_split.test, user_split.test])
        else:
            # First successfully-split user: its splits seed the accumulator
            split_list.extend(user_splits_list)

    return split_list
def split_all(self, ratings: pd.DataFrame, user_id_list: Set[str]):
    """
    Splits the 'ratings' parameter into 'train set' and 'test set', only for the users
    specified inside 'user_id_list'.

    Args:
        ratings (pd.DataFrame): The DataFrame which contains the interactions of the users
            that must be splitted into 'train set' and 'test set'
        user_id_list (Set[str]): The set of users for which splitting will be done

    Returns:
        List of Split objects, each holding the concatenated train and test frames of every
        user that could be split
    """
    split_list = []

    eval_logger.info("Performing {} on ratings of every user".format(str(self._partition_technique)))

    for user_id in progbar(user_id_list, prefix="Current user - {}:", substitute_with_current=True):
        user_ratings = ratings[ratings['from_id'] == user_id]
        try:
            user_splits_list = self._split_single(user_ratings)
        except PartitionError as e:
            # A user whose ratings can't be partitioned is skipped instead of
            # aborting the whole operation
            eval_logger.warning(str(e) + "\nThe user {} will be skipped".format(user_id))
            continue

        if len(split_list) != 0:
            # Merge this user's splits into the accumulated ones, split by split
            for user_split, total_split in zip(user_splits_list, split_list):
                total_split.train = pd.concat([total_split.train, user_split.train])
                total_split.test = pd.concat([total_split.test, user_split.test])
        else:
            for user_split in user_splits_list:
                split_list.append(user_split)  # Only executed once
    return split_list
def eval_fit_recsys(cls, recsys: RecSys, split_list: List[Split], test_items_list: List[pd.DataFrame]):
    """
    Generates a ranking for every user of every split and stores it, together with the
    corresponding ground truth, into the class attribute 'rank_truth_list' as Split objects.

    Args:
        recsys (RecSys): recommender system used to compute the rankings
        split_list (List[Split]): list of Split objects holding train/test frames per user
        test_items_list (List[pd.DataFrame]): one DataFrame per split containing, for every
            user, the items that must be ranked

    Raises:
        AlreadyFittedRecSys: when 'rank_truth_list' already holds previous results
    """
    # Previous results must be cleaned explicitly before a new evaluation
    if cls.rank_truth_list:
        raise AlreadyFittedRecSys

    for split_index, (current_split, items_frame) in enumerate(
            zip(split_list, test_items_list), start=1):

        train_frame = current_split.train

        rank_truth = Split()
        rank_truth.truth = current_split.test

        # Rankings are computed only for users present in the train set
        users_to_fit = set(train_frame.from_id)

        # The literal '{}' in the prefix is substituted with the current user by progbar
        progress_prefix = 'Calculating rank for user {} - split {}'.format('{}', split_index)

        user_rank_frames = []
        for user in progbar(users_to_fit, prefix=progress_prefix,
                            substitute_with_current=True):
            train_ratings = train_frame.loc[train_frame['from_id'] == user]

            # '@user' lets the pandas query reference the local 'user' variable
            items_to_rank = list(items_frame.query('from_id == @user').to_id)

            user_rank_frames.append(recsys._eval_fit_rank(train_ratings, items_to_rank))

        rank_truth.pred = pd.concat(user_rank_frames)
        cls.rank_truth_list.append(rank_truth)