def tfidf_profile(self, items_repository, size, content_filter):
    """
    Build a profile of terms ranked by the TF-IDF weighting of the
    documents found for the packages in self.pkg_profile.

    items_repository -- index handed to data.axi_search_pkgs and
                        data.tfidf_weighting
    size             -- forwarded to self._eliminate_duplicated
                        (presumably the maximum profile length; confirm
                        against that helper)
    content_filter   -- filter forwarded to data.tfidf_weighting

    Returns whatever self._eliminate_duplicated produces from the
    weighted terms.
    """
    package_docs = data.axi_search_pkgs(items_repository, self.pkg_profile)
    # An alternative weighting, data.tfidf_plus(items_repository, docs,
    # content_filter), was previously tried here and is left disabled.
    weighted_terms = data.tfidf_weighting(items_repository, package_docs,
                                          content_filter)
    # Only the first field of every weighting entry is kept before the
    # duplicated stemmed terms are removed.
    candidate_terms = [entry[0] for entry in weighted_terms]
    return self._eliminate_duplicated(candidate_terms, size)
def eset_profile(self, items_repository, size, content_filter):
    """
    Build a profile from the Xapian expansion set (ESet) computed over
    the documents of the packages in self.pkg_profile.

    items_repository -- index used to create the xapian.Enquire object
                        and searched through data.axi_search_pkgs
    size             -- forwarded to self._eliminate_duplicated; twice
                        this amount of terms is requested from Xapian
    content_filter   -- passed to Enquire.get_eset as its last argument
                        (presumably an expand decider; confirm)

    Returns whatever self._eliminate_duplicated produces from the
    expansion terms.
    """
    enquire = xapian.Enquire(items_repository)
    package_docs = data.axi_search_pkgs(items_repository, self.pkg_profile)

    # Every package document becomes part of the relevance set.
    relevant_set = xapian.RSet()
    for doc in package_docs:
        relevant_set.add_document(doc.docid)

    # Twice as many candidates as `size` are requested, presumably to
    # leave room for the duplicates dropped afterwards.
    # NOTE(review): the positional 1 looks like the `k` parameter of the
    # older get_eset signature -- confirm against the Xapian version in use.
    candidate_count = size * 2
    expansion = enquire.get_eset(candidate_count,
                                 relevant_set,
                                 xapian.Enquire.INCLUDE_QUERY_TERMS,
                                 1,
                                 content_filter)

    # Eliminate duplicated stemmed terms.
    expansion_terms = [item.term for item in expansion]
    return self._eliminate_duplicated(expansion_terms, size)
def generate_all_terms_tfidf():
    """
    Rebuild the module-level user_tfidf_weights dictionary.

    The Xapian database under ~/.app-recommender/axi_desktopapps/ is
    opened, the output of `apt-mark showmanual` is read, and every listed
    name that does not start with 'lib' is looked up with
    data.axi_search_pkgs. The resulting documents are weighted twice with
    data.tfidf_weighting (once with FilterTag(0), once with
    FilterDescription(), both with time_context=0) and the two results
    are merged into a single dict.

    Takes no arguments and returns nothing; the result is stored in the
    global user_tfidf_weights.
    """
    global user_tfidf_weights

    index_path = os.path.expanduser("~/.app-recommender/axi_desktopapps/")
    index = xapian.Database(index_path)

    manual_listing = commands.getoutput('apt-mark showmanual')
    package_names = [name for name in manual_listing.splitlines()
                     if not name.startswith('lib')]

    package_docs = data.axi_search_pkgs(index, package_names)

    tag_filter = FilterTag(0)
    tags_weights = data.tfidf_weighting(index, package_docs, tag_filter,
                                        time_context=0)
    description_filter = FilterDescription()
    description_weights = data.tfidf_weighting(index, package_docs,
                                               description_filter,
                                               time_context=0)

    # The two results are joined with `+` and fed to dict(), so a key
    # present in both ends up with the description weight (last one wins).
    user_tfidf_weights = dict(tags_weights + description_weights)