def checkForOpenGraphTag(self):
     """\
     Try to pick the article image from the page's Open Graph
     metadata, i.e. <meta property="og:image" content="...">.

     Tags are examined in the order Parser.getElementsByTag returns
     them. A tag is skipped when its content attribute is empty or
     when getLocallyStoredImage yields nothing for that source.

     Returns the first usable Image (extraction type "opengraph",
     confidence score 100, with bytes/height/width copied from the
     locally stored image), or None if no tag qualifies.
     """
     ogTags = Parser.getElementsByTag(self.article.rawDoc,
                                      tag='meta',
                                      attr='property',
                                      value='og:image')
     for ogTag in ogTags:
         content = Parser.getAttribute(ogTag, attr='content')
         if not content:
             continue

         candidate = Image()
         candidate.imageSrc = content
         candidate.imageExtractionType = "opengraph"
         candidate.confidenceScore = 100

         stored = self.getLocallyStoredImage(candidate.imageSrc)
         if not stored:
             # nothing stored locally for this source, try the next tag
             continue

         candidate.bytes = stored.bytes
         candidate.height = stored.height
         candidate.width = stored.width
         return candidate
     return None
 def checkForLinkTag(self):
     """\
     Try to pick the article image from a
     <link rel="image_src" href="..."> tag on this page.

     Tags are examined in the order Parser.getElementsByTag returns
     them. A tag is skipped when its href attribute is empty or when
     getLocallyStoredImage yields nothing for that source.

     Returns the first usable Image (extraction type "linktag",
     confidence score 100, with bytes/height/width copied from the
     locally stored image), or None if no tag qualifies.
     """
     linkTags = Parser.getElementsByTag(self.article.rawDoc,
                                        tag='link',
                                        attr='rel',
                                        value='image_src')
     for linkTag in linkTags:
         target = Parser.getAttribute(linkTag, attr='href')
         if not target:
             continue

         candidate = Image()
         candidate.imageSrc = target
         candidate.imageExtractionType = "linktag"
         candidate.confidenceScore = 100

         stored = self.getLocallyStoredImage(candidate.imageSrc)
         if not stored:
             # nothing stored locally for this source, try the next tag
             continue

         candidate.bytes = stored.bytes
         candidate.height = stored.height
         candidate.width = stored.width
         return candidate
     return None
Пример #3
0
    def checkForKnownElements(self):
        """\
        in here we check for known image containers from sites
        we've checked out like yahoo, techcrunch, etc... that have
        * known  places to look for good images.
        * TODO: enable this to use a series of settings files
          so people can define what the image ids/classes
          are on specific sites

        The names tried are KNOWN_IMG_DOM_NAMES followed by any
        '|'-separated names configured for the current domain in
        self.customSiteMapping. Each name is looked up as an element
        id first and, failing that, as a class attribute value.

        Returns an Image with extraction type "known" and confidence
        score 90, or None when no known container holds an <img>.
        bytes/height/width are only filled in when
        getLocallyStoredImage returns something for the image source.
        """
        # Build the candidate list locally. Appending to the shared
        # KNOWN_IMG_DOM_NAMES list made it grow on every call and leaked
        # one site's custom names into the lookups for all other sites.
        candidateNames = list(KNOWN_IMG_DOM_NAMES)
        domain = self.getCleanDomain()
        if domain in self.customSiteMapping:
            candidateNames.extend(self.customSiteMapping.get(domain).split('|'))

        knownImage = None

        # No break on purpose: a later name overrides an earlier match,
        # so the per-domain names (added last) win over the defaults.
        for knownName in candidateNames:
            known = Parser.getElementById(self.article.rawDoc, knownName)
            if not known:
                # not found by id, fall back to a class lookup
                known = Parser.getElementsByTag(self.article.rawDoc,
                                                attr='class',
                                                value=knownName)
                if known:
                    known = known[0]
            if known:
                mainImage = Parser.getElementsByTag(known, tag='img')
                if mainImage:
                    knownImage = mainImage[0]

        if knownImage is None:
            return None

        knownImgSrc = Parser.getAttribute(knownImage, attr='src')
        mainImage = Image()
        mainImage.imageSrc = self.buildImagePath(knownImgSrc)
        mainImage.imageExtractionType = "known"
        mainImage.confidenceScore = 90
        locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
        if locallyStoredImage:
            mainImage.bytes = locallyStoredImage.bytes
            mainImage.height = locallyStoredImage.height
            mainImage.width = locallyStoredImage.width

        # returned even without a locally stored copy
        return mainImage
    def checkForKnownElements(self):
        """\
        in here we check for known image containers from sites
        we've checked out like yahoo, techcrunch, etc... that have
        * known  places to look for good images.
        * TODO: enable this to use a series of settings files
          so people can define what the image ids/classes
          are on specific sites

        Names come from KNOWN_IMG_DOM_NAMES, followed by the
        '|'-separated entries self.customSiteMapping holds for the
        current domain (if any). A name is matched against element ids
        first, then against class attribute values.

        Returns an Image (extraction type "known", confidence score
        90) for the matching container's first <img>, or None when
        nothing matched. bytes/height/width are set only if
        getLocallyStoredImage returns something for the image source.
        """
        domain = self.getCleanDomain()

        # Use a per-call copy instead of appending to the module-level
        # KNOWN_IMG_DOM_NAMES: mutating the shared list accumulated
        # duplicates across calls and applied one domain's custom names
        # to every other domain processed afterwards.
        namesToTry = list(KNOWN_IMG_DOM_NAMES)
        if domain in self.customSiteMapping:
            for classname in self.customSiteMapping.get(domain).split('|'):
                namesToTry.append(classname)

        knownImage = None

        # The loop deliberately runs to the end: the last matching name
        # decides, which lets the custom names override the defaults.
        for knownName in namesToTry:
            known = Parser.getElementById(self.article.rawDoc, knownName)
            if not known:
                # no element with that id, try it as a class name
                known = Parser.getElementsByTag(self.article.rawDoc,
                                                attr='class', value=knownName)
                if known:
                    known = known[0]
            if known:
                mainImage = Parser.getElementsByTag(known, tag='img')
                if mainImage:
                    knownImage = mainImage[0]

        if knownImage is None:
            return None

        knownImgSrc = Parser.getAttribute(knownImage, attr='src')
        mainImage = Image()
        mainImage.imageSrc = self.buildImagePath(knownImgSrc)
        mainImage.imageExtractionType = "known"
        mainImage.confidenceScore = 90
        locallyStoredImage = self.getLocallyStoredImage(mainImage.imageSrc)
        if locallyStoredImage:
            mainImage.bytes = locallyStoredImage.bytes
            mainImage.height = locallyStoredImage.height
            mainImage.width = locallyStoredImage.width

        # the image is returned whether or not a local copy was found
        return mainImage
 def checkForOpenGraphTag(self):
     """\
     Scan the page's <meta property="og:image"> tags for a usable
     main image.

     For every tag with a non-empty content attribute an Image is
     prepared (extraction type "opengraph", confidence score 100)
     and its source is handed to getLocallyStoredImage. The first
     Image for which that lookup succeeds is returned with the stored
     bytes/height/width copied over; otherwise None is returned.
     """
     rawDoc = self.article.rawDoc
     metaTags = Parser.getElementsByTag(
         rawDoc, tag='meta', attr='property', value='og:image')
     for metaTag in metaTags:
         imageUrl = Parser.getAttribute(metaTag, attr='content')
         if not imageUrl:
             continue
         ogImage = Image()
         ogImage.imageSrc = imageUrl
         ogImage.imageExtractionType = "opengraph"
         ogImage.confidenceScore = 100
         localCopy = self.getLocallyStoredImage(ogImage.imageSrc)
         if not localCopy:
             # no local copy for this tag, keep looking
             continue
         ogImage.bytes = localCopy.bytes
         ogImage.height = localCopy.height
         ogImage.width = localCopy.width
         return ogImage
     return None
 def checkForLinkTag(self):
     """\
     Scan the page's <link rel="image_src"> tags for a usable main
     image.

     For every tag with a non-empty href attribute an Image is
     prepared (extraction type "linktag", confidence score 100) and
     its source is handed to getLocallyStoredImage. The first Image
     for which that lookup succeeds is returned with the stored
     bytes/height/width copied over; otherwise None is returned.
     """
     rawDoc = self.article.rawDoc
     linkTags = Parser.getElementsByTag(
         rawDoc, tag='link', attr='rel', value='image_src')
     for linkTag in linkTags:
         imageUrl = Parser.getAttribute(linkTag, attr='href')
         if not imageUrl:
             continue
         linkedImage = Image()
         linkedImage.imageSrc = imageUrl
         linkedImage.imageExtractionType = "linktag"
         linkedImage.confidenceScore = 100
         localCopy = self.getLocallyStoredImage(linkedImage.imageSrc)
         if not localCopy:
             # no local copy for this tag, keep looking
             continue
         linkedImage.bytes = localCopy.bytes
         linkedImage.height = localCopy.height
         linkedImage.width = localCopy.width
         return linkedImage
     return None