def mdex(self):
    '''
    Metadata extraction and target association.

    Chooses an extractor from the prefix of ``self.doc["document_url"]``,
    falls back to ``self.mdex_default()`` when no non-empty title is
    present afterwards, and then looks up the watched target for the
    document's landing page.

    Returns ``self.doc`` after extraction, or ``None`` when no target was
    associated and ``self.null_if_no_target_found`` is truthy.
    '''
    # Pass the document through a different extractor based on how the URL starts.
    try:
        document_url = self.doc["document_url"]
        if document_url.startswith("https://www.gov.uk/"):
            self.mdex_gov_uk_publications()
        elif document_url.startswith("http://www.ifs.org.uk/"):
            self.mdex_ifs_reports()
        else:
            self.mdex_default()
    except Exception as e:
        # Use .get() here: indexing a missing key would raise a second
        # exception from inside the handler and defeat the intent of
        # ignoring extraction errors.
        logger.error(
            "Ignoring error during extraction for document %s and landing page %s",
            self.doc.get('document_url'), self.doc.get('landing_page_url'))
        logger.exception(e)

    # A missing or empty title means the extractor did not produce usable output.
    if not self.doc.get('title'):
        logger.info("Falling back on default extraction logic...")
        self.mdex_default()
    logger.info("GOT %s", self.doc)

    # Look up which Target this URL should be associated with.
    # ('in' replaces dict.has_key(), which does not exist on Python 3.)
    if self.targets and 'landing_page_url' in self.doc:
        logger.info(
            "Looking for match for %s source %s and publishers '%s'",
            self.doc['landing_page_url'], self.source,
            self.doc.get('publishers', []))
        self.doc['target_id'] = self.find_watched_target_for(
            self.doc['landing_page_url'], self.source,
            self.doc.get('publishers', []))

    # If there is no association, drop it:
    if not self.doc.get('target_id', None) and self.null_if_no_target_found:
        logger.critical(
            "Failed to associated document with any target: %s", self.doc)
        return None

    # If the publisher appears unambiguous, store it where it can be re-used.
    # (Compare by value: 'is 1' tests object identity, not equality.)
    if len(self.doc.get('publishers', [])) == 1:
        self.doc['publisher'] = self.doc['publishers'][0]

    # Or return the modified version:
    return self.doc
def mdex_ifs_reports(self):
    '''
    Populate ``self.doc`` from the schema.org markup of the landing page.

    Documents whose landing page URL contains ``/publications/feed/`` are
    handed to ``self.mdex_default()`` and nothing is fetched. Otherwise the
    page at ``self.lp_wb_url()`` is downloaded, parsed, and queried with
    XPath for title, publication date, authors, publishers, ISBN and DOI.
    If the document URL does not contain ``www.ifs.org.uk``, ``self.doc``
    is replaced with an empty dict.
    '''
    # XPath queries, written as adjacent literals purely to keep lines short.
    title_xpath = (
        "//*[contains(@itemtype, 'http://schema.org/CreativeWork')]"
        "//*[contains(@itemprop,'name')]/text()")
    date_xpath = (
        "//*[contains(@itemtype, 'http://schema.org/CreativeWork')]"
        "//*[contains(@itemprop,'datePublished')]/@content")
    authors_xpath = (
        "//*[contains(@itemtype, 'http://schema.org/CreativeWork')]"
        "//*[contains(@itemprop,'author')]/a/text()")
    publishers_xpath = (
        "//footer//*[contains(@itemtype, 'http://schema.org/Organization')]"
        "//*[contains(@itemprop,'name')]/text()")
    isbn_xpath = (
        "//*[contains(@itemtype, 'http://schema.org/CreativeWork')]"
        "//tr[td[1]/text()='ISBN:']/td[2]/text()")
    doi_xpath = (
        "//*[contains(@itemtype, 'http://schema.org/CreativeWork')]"
        "//tr[td[1]/text()='DOI:']/td[2]/a[1]/text()")

    # Feed-discovered documents go straight to the default extractor.
    if '/publications/feed/' in self.doc["landing_page_url"]:
        self.mdex_default()
        return

    # Fetch the landing page and parse it as HTML.
    response = requests.get(self.lp_wb_url())
    tree = html.fromstring(response.content)

    # Scalar fields go through self._get0 (presumably "first match" - confirm
    # against its definition); list fields keep the raw XPath result.
    title_hits = tree.xpath(title_xpath)
    self.doc['title'] = self._get0(title_hits).strip()
    date_hits = tree.xpath(date_xpath)
    self.doc['publication_date'] = self._get0(date_hits)
    self.doc['authors'] = tree.xpath(authors_xpath)
    self.doc['publishers'] = tree.xpath(publishers_xpath)
    isbn_hits = tree.xpath(isbn_xpath)
    self.doc['isbn'] = self._get0(isbn_hits).strip()
    doi_hits = tree.xpath(doi_xpath)
    self.doc['doi'] = self._get0(doi_hits)

    # Publications hosted elsewhere are discarded by emptying the document.
    if 'www.ifs.org.uk' not in self.doc['document_url']:
        logger.critical("Dropping off-site publication discovered on the IFS site. %s " % self.doc['document_url'])
        self.doc = {}
def mdex(self):
    '''
    Metadata extraction and target association.

    Chooses an extractor from the prefix of ``self.doc["document_url"]``,
    falls back to ``self.mdex_default()`` when no non-empty title is
    present afterwards, and then looks up the watched target for the
    document's landing page.

    Returns ``self.doc`` after extraction, or ``None`` when no target was
    associated and ``self.null_if_no_target_found`` is truthy.
    '''
    # Pass the document through a different extractor based on how the URL starts.
    try:
        document_url = self.doc["document_url"]
        if document_url.startswith("https://www.gov.uk/"):
            self.mdex_gov_uk_publications()
        elif document_url.startswith("http://www.ifs.org.uk/"):
            self.mdex_ifs_reports()
        else:
            self.mdex_default()
    except Exception as e:
        # Use .get() here: indexing a missing key would raise a second
        # exception from inside the handler and defeat the intent of
        # ignoring extraction errors.
        logger.error(
            "Ignoring error during extraction for document %s and landing page %s",
            self.doc.get('document_url'), self.doc.get('landing_page_url'))
        logger.exception(e)

    # A missing or empty title means the extractor did not produce usable output.
    if not self.doc.get('title'):
        logger.info("Falling back on default extraction logic...")
        self.mdex_default()
    logger.info("GOT %s", self.doc)

    # Look up which Target this URL should be associated with.
    # ('in' replaces dict.has_key(), which does not exist on Python 3.)
    if self.targets and 'landing_page_url' in self.doc:
        logger.info(
            "Looking for match for %s source %s and publishers '%s'",
            self.doc['landing_page_url'], self.source,
            self.doc.get('publishers', []))
        self.doc['target_id'] = self.find_watched_target_for(
            self.doc['landing_page_url'], self.source,
            self.doc.get('publishers', []))

    # If there is no association, drop it:
    if not self.doc.get('target_id', None) and self.null_if_no_target_found:
        logger.critical(
            "Failed to associated document with any target: %s", self.doc)
        return None

    # If the publisher appears unambiguous, store it where it can be re-used.
    # (Compare by value: 'is 1' tests object identity, not equality.)
    if len(self.doc.get('publishers', [])) == 1:
        self.doc['publisher'] = self.doc['publishers'][0]

    # Or return the modified version:
    return self.doc
def mdex_ifs_reports(self):
    '''
    Populate ``self.doc`` from the schema.org markup of the landing page.

    Documents whose landing page URL contains ``/publications/feed/`` are
    handed to ``self.mdex_default()`` and nothing is fetched. Otherwise the
    page at ``self.lp_wb_url()`` is downloaded, parsed, and queried with
    XPath for title, publication date, authors, publishers, ISBN and DOI.
    If the document URL does not contain ``www.ifs.org.uk``, ``self.doc``
    is replaced with an empty dict.
    '''
    # XPath queries, written as adjacent literals purely to keep lines short.
    title_xpath = (
        "//*[contains(@itemtype, 'http://schema.org/CreativeWork')]"
        "//*[contains(@itemprop,'name')]/text()")
    date_xpath = (
        "//*[contains(@itemtype, 'http://schema.org/CreativeWork')]"
        "//*[contains(@itemprop,'datePublished')]/@content")
    authors_xpath = (
        "//*[contains(@itemtype, 'http://schema.org/CreativeWork')]"
        "//*[contains(@itemprop,'author')]/a/text()")
    publishers_xpath = (
        "//footer//*[contains(@itemtype, 'http://schema.org/Organization')]"
        "//*[contains(@itemprop,'name')]/text()")
    isbn_xpath = (
        "//*[contains(@itemtype, 'http://schema.org/CreativeWork')]"
        "//tr[td[1]/text()='ISBN:']/td[2]/text()")
    doi_xpath = (
        "//*[contains(@itemtype, 'http://schema.org/CreativeWork')]"
        "//tr[td[1]/text()='DOI:']/td[2]/a[1]/text()")

    # Feed-discovered documents go straight to the default extractor.
    if '/publications/feed/' in self.doc["landing_page_url"]:
        self.mdex_default()
        return

    # Fetch the landing page and parse it as HTML.
    response = requests.get(self.lp_wb_url())
    tree = html.fromstring(response.content)

    # Scalar fields go through self._get0 (presumably "first match" - confirm
    # against its definition); list fields keep the raw XPath result.
    title_hits = tree.xpath(title_xpath)
    self.doc['title'] = self._get0(title_hits).strip()
    date_hits = tree.xpath(date_xpath)
    self.doc['publication_date'] = self._get0(date_hits)
    self.doc['authors'] = tree.xpath(authors_xpath)
    self.doc['publishers'] = tree.xpath(publishers_xpath)
    isbn_hits = tree.xpath(isbn_xpath)
    self.doc['isbn'] = self._get0(isbn_hits).strip()
    doi_hits = tree.xpath(doi_xpath)
    self.doc['doi'] = self._get0(doi_hits)

    # Publications hosted elsewhere are discarded by emptying the document.
    if 'www.ifs.org.uk' not in self.doc['document_url']:
        logger.critical("Dropping off-site publication discovered on the IFS site. %s " % self.doc['document_url'])
        self.doc = {}