def check_large_images(self, node, parent_depth_level, sibling_depth_level):
    """\
    Return the highest-scoring image found at ``node`` or at one of the
    follow-up nodes proposed by ``self.get_depth_level``.

    Although slow, the best way to determine the best image is to download
    the candidates and check their actual dimensions on disk, so the
    intended approach is phased (the phases themselves live in the helpers
    called here):
    1. get a list of ALL images from the parent node
    2. filter out any bad image names that we know of (gifs, ads, etc..)
    3. do a head request on each file to make sure it meets our bare
       requirements
    4. any images left over get a full GET request, are downloaded to disk
       and have their dimensions checked
    5. score images based on different factors like height/width and
       possibly things like color density

    :param node: element whose image candidates are examined first
    :param parent_depth_level: forwarded to ``fetch_images`` and
        ``get_depth_level``
    :param sibling_depth_level: forwarded to ``get_depth_level``
    :return: an ``Image`` with ``extraction_type`` "bigimage", or ``None``
        when ``get_depth_level`` has no further node to offer
    """
    # Iterating instead of recursing keeps the walk independent of the
    # interpreter's recursion limit; the visiting order is unchanged.
    while True:
        good_images = self.get_image_candidates(node)
        if good_images:
            scored_images = self.fetch_images(good_images, parent_depth_level)
            if scored_images:
                # max() returns the first entry holding the top score, which
                # is the same entry a stable descending sort would put first.
                highscore_image, _ = max(scored_images.items(),
                                         key=lambda entry: entry[1])
                main_image = Image()
                main_image.src = highscore_image.src
                main_image.extraction_type = "bigimage"
                # scored_images is known to be non-empty at this point, so
                # the division cannot hit zero.
                main_image.confidence_score = 100 / len(scored_images)
                return main_image

        depth_obj = self.get_depth_level(node, parent_depth_level,
                                         sibling_depth_level)
        if not depth_obj:
            return None

        # Continue the search from the node/depths handed back to us.
        node = depth_obj.node
        parent_depth_level = depth_obj.parent_depth
        sibling_depth_level = depth_obj.sibling_depth
def scored_image_to_result_image(self, scored_img, scored_imgs_len):
    """\
    Convert a scored image into a result ``Image``.

    Copies ``src``, ``width`` and ``height`` from ``scored_img`` and tags
    the result with the "bigimage" extraction type.

    :param scored_img: object exposing ``src``, ``width`` and ``height``
    :param scored_imgs_len: number of scored images the winner was picked
        from; the confidence score is ``100 / scored_imgs_len``
    :return: the populated ``Image``
    """
    img = Image()
    img.src = scored_img.src
    img.width = scored_img.width
    img.height = scored_img.height
    img.extraction_type = "bigimage"
    # A count of zero would otherwise raise ZeroDivisionError; report zero
    # confidence instead.
    img.confidence_score = 100 / scored_imgs_len if scored_imgs_len else 0
    return img
def get_best_image(self, doc, topNode):
    """\
    Try each extraction strategy in turn and return the first truthy
    result: known elements, then large images under ``topNode`` (starting
    at depth 0/0), then meta tags. When every strategy comes back empty a
    blank ``Image`` is returned.

    ``doc`` is accepted but not used by this method.
    """
    # ``or`` short-circuits, so a later strategy only runs when every
    # earlier one produced a falsy result.
    return (
        self.check_known_elements()
        or self.check_large_images(topNode, 0, 0)
        or self.check_meta_tag()
        or Image()
    )
def get_image(self, element, src, score=100, extraction_type="N/A"):
    """\
    Build an ``Image`` for ``src``.

    The source is passed through ``self.build_image_path``; ``score`` and
    ``extraction_type`` are stored on the result as given. When
    ``self.get_local_image`` returns something truthy for that source, its
    ``bytes``, ``height`` and ``width`` are copied onto the result as well.

    ``element`` is accepted but not used by this method.
    """
    result = Image()
    result.src = self.build_image_path(src)
    result.extraction_type = extraction_type
    result.confidence_score = score

    # Without a local copy there is nothing more to add.
    on_disk = self.get_local_image(result.src)
    if not on_disk:
        return result

    for field in ("bytes", "height", "width"):
        setattr(result, field, getattr(on_disk, field))
    return result
def check_known_elements(self):
    """\
    in here we check for known image containers from sites we've checked
    out like yahoo, techcrunch, etc... that have known places to look for
    good images.

    The names in ``KNOWN_IMG_DOM_NAMES`` are tried first as element ids and
    then as class values; class names configured for the current domain in
    ``self.custom_site_mapping`` (a '|'-separated string) are tried after
    them. Every name is tried, so when several containers hold an ``img``
    the last one examined wins.

    TODO: enable this to use a series of settings files so people can
    define what the image ids/classes are on specific sites

    :return: an ``Image`` with ``extraction_type`` "known" and a confidence
        score of 90, or ``None`` when no known container holds an image
    """
    # Work on a copy: appending to the module-level list would make it grow
    # on every call and leak one site's class names into other sites.
    candidate_names = list(KNOWN_IMG_DOM_NAMES)
    domain = self.get_clean_domain()
    if domain in self.custom_site_mapping:
        candidate_names.extend(self.custom_site_mapping[domain].split('|'))

    known_image = None
    for known_name in candidate_names:
        # Look the name up as an id first, then fall back to a class match.
        known = self.parser.getElementById(self.article.raw_doc, known_name)
        if not known:
            matches = self.parser.getElementsByTag(self.article.raw_doc,
                                                   attr='class',
                                                   value=known_name)
            known = matches[0] if matches else None
        if known:
            found_imgs = self.parser.getElementsByTag(known, tag='img')
            if found_imgs:
                known_image = found_imgs[0]

    if known_image is None:
        return None

    known_image_source = self.parser.getAttribute(known_image, attr='src')
    main_image = Image()
    main_image.src = self.build_image_path(known_image_source)
    main_image.extraction_type = "known"
    main_image.confidence_score = 90

    # Enrich with size information when a local copy is available.
    local_image = self.get_local_image(main_image.src)
    if local_image:
        main_image.bytes = local_image.bytes
        main_image.height = local_image.height
        main_image.width = local_image.width
    return main_image
def check_opengraph_tag(self):
    """\
    Look for an Open Graph image on the page.

    Scans the ``meta`` elements whose ``property`` is ``og:image`` and uses
    the first non-empty ``content`` value as the image source. Returns an
    ``Image`` with ``extraction_type`` "opengraph" and a confidence score
    of 100, or ``None`` when no such value exists.
    """
    document = self.article.raw_doc
    og_tags = self.parser.getElementsByTag(document, tag='meta',
                                           attr='property', value='og:image')

    # Lazily read ``content`` tag by tag and stop at the first truthy one.
    contents = (self.parser.getAttribute(tag, attr='content')
                for tag in og_tags)
    href = next((value for value in contents if value), None)
    if not href:
        return None

    og_image = Image()
    og_image.src = href
    og_image.extraction_type = "opengraph"
    og_image.confidence_score = 100

    # Copy size details over when a local copy of the image is available.
    on_disk = self.get_local_image(og_image.src)
    if on_disk:
        og_image.bytes = on_disk.bytes
        og_image.height = on_disk.height
        og_image.width = on_disk.width
    return og_image
def check_link_tag(self):
    """\
    Look for a ``<link rel="image_src">`` image on the page.

    The first such ``link`` element with a non-empty ``href`` supplies the
    image source. Returns an ``Image`` with ``extraction_type`` "linktag"
    and a confidence score of 100, or ``None`` when no element qualifies.
    """
    document = self.article.raw_doc
    link_tags = self.parser.getElementsByTag(document, tag='link',
                                             attr='rel', value='image_src')
    for link in link_tags:
        href = self.parser.getAttribute(link, attr='href')
        if not href:
            # Nothing usable on this element; try the next one.
            continue

        link_image = Image()
        link_image.src = href
        link_image.extraction_type = "linktag"
        link_image.confidence_score = 100

        # Copy size details over when a local copy of the image is available.
        on_disk = self.get_local_image(link_image.src)
        if on_disk:
            link_image.bytes = on_disk.bytes
            link_image.height = on_disk.height
            link_image.width = on_disk.width
        return link_image

    return None
def getExpectedImage(self, expected_value):
    """\
    Build an ``Image`` whose attributes mirror ``expected_value``.

    Each key/value pair yielded by ``expected_value.items()`` is set on a
    fresh ``Image`` as an attribute of the same name.
    """
    expected_image = Image()
    for attr_name, attr_value in expected_value.items():
        setattr(expected_image, attr_name, attr_value)
    return expected_image