예제 #1
0
    def mdex(self):
        '''
        Metadata extraction and target association.

        Runs a URL-specific extractor over self.doc, falling back to the
        default extractor when no title was found, then tries to associate
        the document with a watched Target.

        Returns the updated document dict, or None when no target matched
        and self.null_if_no_target_found is set.
        '''
        # Pass the document through a different extractor based on how the URL starts.
        try:
            if self.doc["document_url"].startswith("https://www.gov.uk/"):
                self.mdex_gov_uk_publications()
            elif self.doc["document_url"].startswith(
                    "http://www.ifs.org.uk/"):
                self.mdex_ifs_reports()
            else:
                self.mdex_default()
        except Exception as e:
            # Extraction is best-effort: log the failure and carry on with
            # whatever metadata we already have.
            logger.error(
                "Ignoring error during extraction for document %s and landing page %s"
                % (self.doc['document_url'], self.doc['landing_page_url']))
            logger.exception(e)

        # If the specialised extractor found no title, retry with the default:
        if 'title' not in self.doc or not self.doc['title']:
            logger.info("Falling back on default extraction logic...")
            self.mdex_default()
            logger.info("GOT %s" % self.doc)

        # Look up which Target this URL should be associated with:
        # (dict.has_key() was removed in Python 3; use the 'in' operator.)
        if self.targets and 'landing_page_url' in self.doc:
            logger.info(
                "Looking for match for %s source %s and publishers '%s'" %
                (self.doc['landing_page_url'], self.source,
                 self.doc.get('publishers', [])))
            self.doc['target_id'] = self.find_watched_target_for(
                self.doc['landing_page_url'], self.source,
                self.doc.get('publishers', []))

        # If there is no association, drop it:
        if not self.doc.get('target_id',
                            None) and self.null_if_no_target_found:
            logger.critical(
                "Failed to associate document with any target: %s" % self.doc)
            return None

        # If the publisher appears unambiguous, store it where it can be re-used
        # ('is 1' only worked via CPython small-int caching; compare with '==').
        if len(self.doc.get('publishers', [])) == 1:
            self.doc['publisher'] = self.doc['publishers'][0]

        # Or return the modified version:
        return self.doc
예제 #2
0
 def mdex_ifs_reports(self):
     '''
     Extract metadata for an IFS (www.ifs.org.uk) report landing page.

     Scrapes the schema.org CreativeWork/Organization markup for title,
     publication date, authors, publishers, ISBN and DOI. Off-site documents
     discovered on the IFS site are dropped by resetting self.doc to {}.
     '''
     # Do not try to extract metadata if we got there from the feed:
     if '/publications/feed/' in self.doc["landing_page_url"]:
         self.mdex_default()
         return
     # Grab the landing page URL as HTML
     r = requests.get(self.lp_wb_url())
     h = html.fromstring(r.content)
     # Extract the metadata; the repeated CreativeWork scope is hoisted so
     # every query string is identical to the original ones.
     cw = "//*[contains(@itemtype, 'http://schema.org/CreativeWork')]"
     # NOTE(review): .strip() assumes _get0() returns a string even when the
     # XPath matches nothing — confirm against _get0's definition.
     self.doc['title'] = self._get0(h.xpath(cw + "//*[contains(@itemprop,'name')]/text()")).strip()
     self.doc['publication_date'] = self._get0(h.xpath(cw + "//*[contains(@itemprop,'datePublished')]/@content"))
     self.doc['authors'] = h.xpath(cw + "//*[contains(@itemprop,'author')]/a/text()")
     self.doc['publishers'] = h.xpath("//footer//*[contains(@itemtype, 'http://schema.org/Organization')]//*[contains(@itemprop,'name')]/text()")
     self.doc['isbn'] = self._get0(h.xpath(cw + "//tr[td[1]/text()='ISBN:']/td[2]/text()")).strip()
     self.doc['doi'] = self._get0(h.xpath(cw + "//tr[td[1]/text()='DOI:']/td[2]/a[1]/text()"))
     # Drop publications that are not actually hosted on the IFS site:
     if 'www.ifs.org.uk' not in self.doc['document_url']:
         logger.critical("Dropping off-site publication discovered on the IFS site. %s " % self.doc['document_url'])
         self.doc = dict()
예제 #3
0
    def mdex(self):
        '''
        Metadata extraction and target association.

        Runs a URL-specific extractor, falls back to the default extractor
        when no title was found, then tries to associate the document with a
        watched Target. Returns the updated document, or None when no target
        matched and self.null_if_no_target_found is set.
        '''
        # Pass the document through a different extractor based on how the URL starts.
        try:
            if self.doc["document_url"].startswith("https://www.gov.uk/"):
                self.mdex_gov_uk_publications()
            elif self.doc["document_url"].startswith("http://www.ifs.org.uk/"):
                self.mdex_ifs_reports()
            else:
                self.mdex_default()
        except Exception as e:
            # Extraction is best-effort: log and continue with what we have.
            logger.error("Ignoring error during extraction for document %s and landing page %s" % (self.doc['document_url'], self.doc['landing_page_url']))
            logger.exception(e)

        # If the specialised extractor found no title, retry with the default:
        if 'title' not in self.doc or not self.doc['title']:
            logger.info("Falling back on default extraction logic...")
            self.mdex_default()
            logger.info("GOT %s" % self.doc)

        # Look up which Target this URL should be associated with:
        # (dict.has_key() was removed in Python 3; use the 'in' operator.)
        if self.targets and 'landing_page_url' in self.doc:
            logger.info("Looking for match for %s source %s and publishers '%s'" % (self.doc['landing_page_url'], self.source, self.doc.get('publishers', [])))
            self.doc['target_id'] = self.find_watched_target_for(self.doc['landing_page_url'], self.source, self.doc.get('publishers', []))

        # If there is no association, drop it:
        if not self.doc.get('target_id', None) and self.null_if_no_target_found:
            logger.critical("Failed to associate document with any target: %s" % self.doc)
            return None

        # If the publisher appears unambiguous, store it where it can be re-used
        # ('is 1' only worked via CPython small-int caching; compare with '==').
        if len(self.doc.get('publishers', [])) == 1:
            self.doc['publisher'] = self.doc['publishers'][0]

        # Or return the modified version:
        return self.doc
예제 #4
0
 def mdex_ifs_reports(self):
     '''
     Extract metadata for an IFS (www.ifs.org.uk) report landing page.

     Scrapes the schema.org CreativeWork/Organization markup for title,
     publication date, authors, publishers, ISBN and DOI. Off-site
     documents discovered on the IFS site are dropped by resetting
     self.doc to {}.
     '''
     # Do not try to extract metadata if we got there from the feed:
     if '/publications/feed/' in self.doc["landing_page_url"]:
         self.mdex_default()
         return
     # Grab the landing page URL as HTML
     r = requests.get(self.lp_wb_url())
     h = html.fromstring(r.content)
     # Extract the metadata; the repeated CreativeWork scope is hoisted
     # so every query string is identical to the original ones.
     cw = "//*[contains(@itemtype, 'http://schema.org/CreativeWork')]"
     # NOTE(review): .strip() assumes _get0() returns a string even when
     # the XPath matches nothing — confirm against _get0's definition.
     self.doc['title'] = self._get0(
         h.xpath(cw + "//*[contains(@itemprop,'name')]/text()")).strip()
     self.doc['publication_date'] = self._get0(
         h.xpath(cw + "//*[contains(@itemprop,'datePublished')]/@content"))
     self.doc['authors'] = h.xpath(
         cw + "//*[contains(@itemprop,'author')]/a/text()")
     self.doc['publishers'] = h.xpath(
         "//footer//*[contains(@itemtype, 'http://schema.org/Organization')]//*[contains(@itemprop,'name')]/text()"
     )
     self.doc['isbn'] = self._get0(
         h.xpath(cw + "//tr[td[1]/text()='ISBN:']/td[2]/text()")).strip()
     self.doc['doi'] = self._get0(
         h.xpath(cw + "//tr[td[1]/text()='DOI:']/td[2]/a[1]/text()"))
     # Drop publications that are not actually hosted on the IFS site:
     if 'www.ifs.org.uk' not in self.doc['document_url']:
         logger.critical(
             "Dropping off-site publication discovered on the IFS site. %s "
             % self.doc['document_url'])
         self.doc = dict()