def weight_links(self, window_width=1):
    """Pair every link of the page body with a sliding-window word weight.

    The mapped content list and the links are obtained from
    ``ContentExtractor.extract_mapped_content_list(self._body_soup)``.
    Word weights are computed once with ``self._weight_words`` and cached
    on ``self._words`` / ``self._weighted_words``.

    Args:
        window_width: size of the sliding window (default 1).

    Returns:
        A list of ``(link, weight)`` tuples:
          - link is a BeautifulSoup tag (<a>)
          - weight is an integer

        When there are no more links than ``window_width``, every link
        receives the same weight (the sum of the first
        ``min(window_width, link_count)`` word weights).

    Raises:
        IndexError: if the cached weight list is shorter than the highest
            index the window reaches.
    """
    content_list, links = ContentExtractor.extract_mapped_content_list(self._body_soup)
    link_count = len(links)

    if not self._words:
        # Non-int entries are treated as words
        # (ints are presumably link placeholders -- TODO confirm).
        self._words = [s for s in content_list if type(s) != int]
        # Build a real list: the weights are sliced and indexed below, which
        # a lazy Python 3 ``map`` object does not support.
        self._weighted_words = [self._weight_words(word) for word in self._words]
    weights = self._weighted_words

    # Weight of the initial window.
    win_weight = sum(weights[:min(window_width, link_count)])

    if link_count <= window_width:
        return [(link, win_weight) for link in links]

    left = 0
    right = window_width
    weighted_links = []

    # NOTE(review): both edges are advanced *before* they are read, so
    # weights[window_width] is never added and weights[0] is never removed.
    # Kept as-is; confirm this offset is intended.

    # Leading links: only the right edge of the window moves.
    for i in range(window_width):
        weighted_links.append((links[i], win_weight))
        right += 1
        win_weight += weights[right]

    # Interior links: the window slides, gaining one weight and losing one.
    for i in range(window_width, link_count - window_width):
        weighted_links.append((links[i], win_weight))
        left += 1
        right += 1
        win_weight += weights[right] - weights[left]

    # Trailing links: only the left edge of the window moves.
    # NOTE(review): when window_width < link_count < 2 * window_width this
    # range overlaps the leading loop, so some links are emitted twice.
    # Kept as-is; confirm the intended behaviour.
    for i in range(link_count - window_width, link_count):
        weighted_links.append((links[i], win_weight))
        left += 1
        win_weight -= weights[left]

    return weighted_links
def __init__(self, mode):
    """Set up the block maker, the content extractor and empty result lists.

    Args:
        mode: extraction mode, given either as a number or as its name:
            1 / "news", 2 / "blog", 3 / "shop".

    Raises:
        ValueError: if ``mode`` is none of the accepted values.
    """
    self.blockmaker = BlockMaker.BlockMaker()
    self.url = ""
    self.mode = ""

    # (numeric mode, mode name, value stored on self.mode).
    # The stored strings carry a leading space; kept exactly as they were.
    known_modes = (
        (1, "news", " 1"),
        (2, "blog", " 2"),
        (3, "shop", " 3"),
    )
    for number, name, stored in known_modes:
        if mode == number or mode == name:
            self.mode = stored
            self.extractor = ContentExtractor.ContentExtractor(number)
            break
    else:
        # No entry matched the requested mode.
        raise ValueError("Select mode \"news\" or \"blog\" or \"shop\" for 1st argument!")

    # Result containers, all starting out empty.
    self.titles = []
    self.texts = []
    self.images = []
    self.boxes = []
def get_page_weight(self):
    """Return the sum of the weights of all words in the page body.

    On the first call (while ``self._words`` is empty) the words are taken
    from the first element returned by
    ``ContentExtractor.extract_mapped_content_list(self._body_soup)`` and
    weighted with ``self._weight_words``; both lists are cached on the
    instance. Later calls reuse the cache and do not run the extractor.

    Returns:
        The sum of ``self._weighted_words``.
    """
    if not self._words:
        # The extraction result is only needed to fill the cache, so it is
        # done on a cache miss only.
        content_list = ContentExtractor.extract_mapped_content_list(self._body_soup)[0]
        # Non-int entries are treated as words
        # (ints are presumably link placeholders -- TODO confirm).
        self._words = [s for s in content_list if type(s) != int]
        # Cache a real list: a lazy Python 3 ``map`` object would be
        # exhausted by the first ``sum`` and every later call would return 0.
        self._weighted_words = [self._weight_words(word) for word in self._words]
    return sum(self._weighted_words)