示例#1
0
 def weight_links(self, window_width = 1):
     """
     Pair every link found in the page body with a sliding-window weight.

     Word weights are computed with ``self._weight_words`` on first use and
     cached on ``self._words`` / ``self._weighted_words``; the cache is
     reused for as long as ``self._words`` is non-empty.

     window_width: number of weights summed for the initial window; it also
       bounds the length of the ramp-up and ramp-down phases.

     result: [(link, weight)]:
     - link is BeautifulSoup tag (<a>)
     - weight is integer

     Each link appears exactly once, in the order of the extracted links.
     May raise IndexError: the loops are bounded by the number of links but
     index into the list of word weights.
     """
     content_list, links = ContentExtractor.extract_mapped_content_list(self._body_soup)
     link_count = len(links)
     if not self._words:
         # Entries whose exact type is int are dropped; the rest are weighted.
         self._words = [s for s in content_list if type(s) != int]
         # Keep a real list: the weights are sliced and indexed below and
         # reused by later calls, which a lazy Python 3 map object cannot do.
         self._weighted_words = [self._weight_words(word) for word in self._words]
     weights = self._weighted_words
     left = 0
     right = window_width
     win_weight = sum(weights[0:min(window_width, link_count)])
     if link_count <= window_width:
         # The window covers every link: they all share the same weight.
         return [(link, win_weight) for link in links]
     weighted_links = []
     # NOTE(review): ``right`` and ``left`` are advanced *before* they are
     # used, so weights[window_width] is never added and weights[0] is never
     # removed from the running sum. Kept as in the original; confirm whether
     # this is intended.
     # Ramp-up: only the right edge of the window moves.
     for i in range(window_width):
         weighted_links.append( (links[i], win_weight) )
         right += 1
         win_weight += weights[right]
     # Steady state: both edges move by one position per link.
     for i in range(window_width, link_count - window_width):
         weighted_links.append( (links[i], win_weight) )
         left += 1
         right += 1
         win_weight += weights[right] - weights[left]
     # Ramp-down: only the left edge moves. The start is clamped so that,
     # when link_count < 2 * window_width, links already emitted by the
     # ramp-up loop are not emitted a second time.
     for i in range( max(window_width, link_count - window_width), link_count ):
         weighted_links.append( (links[i], win_weight) )
         left += 1
         win_weight -= weights[left]
     return weighted_links
示例#2
0
 def __init__(self, mode):
     """
     Create the block maker and the extractor matching ``mode``.

     mode: 1 or "news", 2 or "blog", 3 or "shop". The numeric code is what
       is passed to ``ContentExtractor.ContentExtractor``.

     Raises ValueError for any other value. The block maker, ``url`` and an
     empty ``mode`` are already set on the instance by then.
     """
     self.blockmaker = BlockMaker.BlockMaker()
     self.url = ""
     self.mode = ""
     # (numeric code, name alias, value stored in self.mode)
     known_modes = (
         (1, "news", " 1"),
         (2, "blog", " 2"),
         (3, "shop", " 3"),
     )
     for code, alias, label in known_modes:
         if mode == code or mode == alias:
             self.mode = label
             self.extractor = ContentExtractor.ContentExtractor(code)
             break
     else:
         # No entry matched.
         raise ValueError("Select mode \"news\" or \"blog\" or \"shop\" for 1st argument!")
     # Result containers start out empty.
     self.titles = []
     self.texts = []
     self.images = []
     self.boxes = []
示例#3
0
 def get_page_weight(self):
     """
     Return the sum of the weights of all words in the page body.

     Word weights are computed with ``self._weight_words`` on first use and
     cached on ``self._words`` / ``self._weighted_words``; the cache is
     reused for as long as ``self._words`` is non-empty.
     """
     # The content is extracted on every call, as before, even when the
     # cache is already filled.
     content_list = ContentExtractor.extract_mapped_content_list(self._body_soup)[0]
     if not self._words:
         # Entries whose exact type is int are dropped; the rest are weighted.
         self._words = [s for s in content_list if type(s) != int]
         # Cache a real list rather than a map object: on Python 3 a map is
         # a one-shot iterator, so a second call would sum to 0.
         self._weighted_words = [self._weight_words(word) for word in self._words]
     return sum(self._weighted_words)