示例#1
0
 def get_article_representation(self, article_title):
     ''' Builds a bag-of-words representation from the text
     content of the given article's page.

     The page text is fetched with
     wikipedia_api_util.query_page_content_text and then passed
     through text_util.get_clean_BOW_doc for cleaning.

     @param article_title: title of the article to represent
     @return: the cleaned bag of words (a list of tokens) '''
     # Fetch the page text and clean it in a single pass.
     return text_util.get_clean_BOW_doc(
         wikipedia_api_util.query_page_content_text(article_title))
示例#2
0
 def get_article_representation(self, article_title):
     ''' Builds a bag-of-words representation from the titles of
     the categories of the given article.

     The category titles come from
     wikipedia_api_util.query_categories_of_res; they are joined
     with single spaces, run through self.__format_category__ and
     finally cleaned by text_util.get_clean_BOW_doc.

     @param article_title: title of the article to represent
     @return: the cleaned bag of words (a list of tokens) '''
     titles = wikipedia_api_util.query_categories_of_res(article_title)
     # Merge all category titles into one space-separated string
     # before formatting, so the formatter sees a single document.
     joined_titles = ' '.join(titles)
     formatted_titles = self.__format_category__(joined_titles)
     return text_util.get_clean_BOW_doc(formatted_titles)
示例#3
0
 def get_article_representation(self, article_title):
     ''' Builds a bag-of-words representation from the title of
     the given article.

     Underscores in the title are replaced by spaces before the
     result is cleaned by text_util.get_clean_BOW_doc.

     @param article_title: title of the article to represent
     @return: the cleaned bag of words (a list of tokens) '''
     # Turn underscores into spaces so the title reads as plain text.
     return text_util.get_clean_BOW_doc(article_title.replace('_', ' '))