def pubmed_adapter(search=None, id=None):
    '''
    Sample queries:
    #curl "http://localhost:8880/pubmed?"
    curl "http://localhost:8880/pubmed?search=stem+cells"
    curl "http://localhost:8880/pubmed?id=19358275"
    '''
    #FIXME: How do we handle no search or id param? Just serve up the latest entries? Or error as below?
    #assert_(not(search and id), msg="You must specify either the 'search' or the 'id' query parameter.")
    if search:
        #search = first_item(search)
        #reldate: only search for last N days
        #query = urllib.urlencode({'db': NCBI_DB, 'term': query, 'reldate': '60', 'datetype': 'edat', 'retmax': DEFAULT_MAX_RESULTS, 'usehistory': 'y'})
        query = urllib.urlencode({'term': search, 'db': NCBI_DB, 'datetype': 'edat', 'retmax': DEFAULT_MAX_RESULTS, 'usehistory': 'y'})
        search_url = NCBI_SEARCH_PATTERN + query
        logger.debug("Term search URL: " + search_url)
        doc = bindery.parse(search_url, standalone=True)
        search_terms = search
        ids = ( unicode(i) for i in doc.eSearchResult.IdList.Id )
        ids = ','.join(ids)
        self_link = '/pubmed?search=' + search
    else:
        ids = first_item(id)
        #fulltext = fulltext[0] if fulltext else u'no'
        #if fulltext == 'yes':
        search_terms = ids
        self_link = '/pubmed?id=' + ids
    #Common to both branches: fetch the full article records for the collected IDs
    query = urllib.urlencode({'db': NCBI_DB, 'id': ids, 'retmode': 'xml'})
    search_url = NCBI_ARTICLE_ACCESS_PATTERN + query
    logger.debug("ID search URL: " + search_url)
    alt_link = search_url
    doc = bindery.parse(search_url, standalone=True, model=PUBMED_MODEL)
    #doc = bindery.parse(open('/Users/uche/tmp/efetch.fcgi.html'), standalone=True, model=PUBMED_MODEL)
    metadata, first_id = metadata_dict(generate_metadata(doc))
    return atom_results(doc, metadata, self_link, alt_link, search_terms)
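# A self-contained sketch of the two-step NCBI E-utilities flow the adapter
# above relies on: esearch returns the matching PubMed IDs, efetch then
# retrieves the article records for those IDs. The base URLs below are the
# public E-utilities endpoints; NCBI_SEARCH_PATTERN and
# NCBI_ARTICLE_ACCESS_PATTERN above are assumed to wrap these same endpoints.
import urllib

ESEARCH_BASE = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?'
EFETCH_BASE = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'

def pubmed_urls(term, retmax=20):
    '''Return the esearch URL for a term and a builder for the efetch URL over IDs.'''
    search_url = ESEARCH_BASE + urllib.urlencode(
        {'db': 'pubmed', 'term': term, 'retmax': retmax, 'usehistory': 'y'})
    fetch_url = lambda ids: EFETCH_BASE + urllib.urlencode(
        {'db': 'pubmed', 'id': ','.join(ids), 'retmode': 'xml'})
    return search_url, fetch_url

#Example: search_url, fetch_url = pubmed_urls('stem cells')
#GET search_url, collect the <Id> values from eSearchResult, then GET fetch_url(ids).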
def list_records(self, set="", resumption_token = ""): ''' List records. Use either the resumption token or set id. ''' if resumption_token: params = {'verb' : 'ListRecords', 'resumptionToken': resumption_token} else: params = {'verb' : 'ListRecords', 'metadataPrefix': 'oai_dc', 'set': set} qstr = urllib.urlencode(params) url = self.root + '?' + qstr self.logger.debug('OAI request URL: {0}'.format(url)) start_t = time.time() resp, content = self.h.request(url) retrieved_t = time.time() self.logger.debug('Retrieved in {0}s'.format(retrieved_t - start_t)) doc = bindery.parse(url, model=OAI_LISTRECORDS_MODEL) records, first_id = metadata_dict(generate_metadata(doc), nesteddict=False) for id_, props in records: for k, v in props.iteritems(): props[k] = [ U(item) for item in v ] if (doc.OAI_PMH.ListRecords is not None) and (doc.OAI_PMH.ListRecords.resumptionToken is not None): resumption_token = U(doc.OAI_PMH.ListRecords.resumptionToken) else: resumption_token = '' return {'records' : records, 'resumption_token' : resumption_token}
def factory(rest_uri, moin_link=None, opener=None):
    opener = opener or urllib2.build_opener()
    logger.debug('rest_uri: ' + rest_uri)
    req = urllib2.Request(rest_uri, headers={'Accept': DOCBOOK_IMT})
    resp = opener.open(req)
    doc = bindery.parse(resp, standalone=True, model=MOIN_DOCBOOK_MODEL)
    original_wiki_base = dict(resp.info())[ORIG_BASE_HEADER]
    #self.original_wiki_base = dict(resp.info())[ORIG_BASE_HEADER]
    #amara.xml_print(self.content_cache)
    metadata, first_id = metadata_dict(generate_metadata(doc))
    metadata = metadata[first_id]
    akara_type = U(metadata[u'ak-type'])
    logger.debug('Type: ' + akara_type)
    try:
        #Older Moin CMS resource types are implemented by registration to the global node.NODES
        cls = node.NODES[akara_type]
    except KeyError:
        #Newer Moin CMS resource types are implemented by discovery of a URL,
        #to which a POST request executes the desired action
        return node.ENDPOINTS and (rest_uri, akara_type, node.ENDPOINTS[akara_type], doc, metadata, original_wiki_base)
    else:
        instance = cls(rest_uri, moin_link, opener, cache=(doc, metadata, original_wiki_base))
        return instance
def search(self, term):
    qstr = urllib.urlencode({'verb': 'GetRecord', 'metadataPrefix': 'oai_dc', 'identifier': term})
    url = DSPACE_OAI_ENDPOINT + '?' + qstr
    logger.debug('DSpace URL: ' + str(url))
    #keywords = [ (k.strip(), JOVE_TAG) for k in unicode(row.xml_select(u'string(.//*[@class="keywords"])')).split(',') ]
    doc = bindery.parse(url, model=OAI_MODEL)
    #print >> sys.stderr, list(generate_metadata(doc))
    resources, first_id = metadata_dict(generate_metadata(doc), nesteddict=False)
    record = doc.OAI_PMH
    resource = resources[first_id]
    return resource
def atomize_oai_record(endpoint=None, id=None):
    '''
    endpoint - the OAI request URL, e.g. http://dspace.mit.edu/oai/request
    id - the article ID, e.g. oai:dspace.mit.edu:1721.1/5451

    Sample request:
    curl "http://localhost:8880/akara.oai.atom?endpoint=http://dspace.mit.edu/oai/request&id=oai:dspace.mit.edu:1721.1/5451"
    '''
    if endpoint is None:
        raise ValueError('endpoint required')
    if id is None:
        raise ValueError('id required')
    qstr = urllib.urlencode({'verb': 'GetRecord', 'metadataPrefix': 'oai_dc', 'identifier': id})
    url = endpoint + '?' + qstr
    doc = bindery.parse(url, model=OAI_MODEL)
    resources = metadata_dict(generate_metadata(doc))
    #print resources
    f = feed(ATOM_ENVELOPE)
    #f = feed(ATOM_ENVELOPE, title=resources['title'], id=resources['id'])
    #f.source.feed.xml_append(E((ATOM_NAMESPACE, u'link'), {u'rel': u'self', u'type': u'application/atom+xml', u'href': self_link.decode('utf-8')}))
    #f.source.feed.xml_append(E((ATOM_NAMESPACE, u'link'), {u'rel': u'search', u'type': u'application/opensearchdescription+xml', u'href': u'http://kds-kci.zepheira.com/sciencedirect.discovery'}))
    #f.source.feed.xml_append(E((ATOM_NAMESPACE, u'link'), {u'rel': u'alternate', u'type': u'text/xml', u'href': alt_link.decode('utf-8')}))
    #f.source.feed.xml_append(E((OPENSEARCH_NAMESPACE, u'Query'), {u'role': u'request', u'searchTerms': search_terms.decode('utf-8')}))
    #maxarticles = DEFAULT_MAX_RESULTS
    maxarticles = 3
    for record in islice(doc.OAI_PMH, 0, maxarticles):
        resource = resources[id]
        print resource
        authors = [ (a, None, None) for a in resource[u'creator'] ]
        links = [
            (unicode(resource['handle']), u'alternate'),
        ]
        #categories = [ (unicode(k), SD_NS+u'authorKeyword') for k in authkw(article) ]
        #elements = [
        #    E((SD_NS, u'sd:journal-cover'), unicode(article.journalCover).strip() if hasattr(article, 'journalCover') else DEFAULT_ICON),
        #    E((SD_NS, u'sd:journal-name'), unicode(article.journalName)),
        #]
        f.append(
            id,
            unicode(resource['title'][0]),
            updated=unicode(resource['date'][0]),
            summary=unicode(resource['description'][0]),
            authors=authors,
            links=links,
            #categories=categories,
            #elements=elements,
        )
    return f.source.xml_encode('xml-indent')
def __init__(self, rest_uri, opener):
    self.rest_uri = rest_uri
    self.opener = opener
    #from node.factory
    req = urllib2.Request(rest_uri, headers={'Accept': DOCBOOK_IMT})
    print >> sys.stderr, 'rest_uri: ', rest_uri
    with closing(opener.open(req)) as resp:
        doc = bindery.parse(resp, standalone=True, model=MOIN_DOCBOOK_MODEL)
        original_wiki_base = dict(resp.info())[ORIG_BASE_HEADER]
        #self.original_wiki_base = dict(resp.info())[ORIG_BASE_HEADER]
    #amara.xml_print(self.content_cache)
    metadata = metadata_dict(generate_metadata(doc))
    self.cache = (doc, metadata, original_wiki_base)
    return
def get_record(self, id):
    params = {'verb': 'GetRecord', 'metadataPrefix': 'oai_dc', 'identifier': id}
    qstr = urllib.urlencode(params)
    url = self.root + '?' + qstr
    self.logger.debug('OAI request URL: {0}'.format(url))
    start_t = time.time()
    resp, content = self.h.request(url)
    retrieved_t = time.time()
    self.logger.debug('Retrieved in {0}s'.format(retrieved_t - start_t))
    doc = bindery.parse(url, model=OAI_GETRECORD_MODEL)
    record, rid = metadata_dict(generate_metadata(doc), nesteddict=False)
    for id_, props in (record if isinstance(record, list) else [record]):
        for k, v in props.iteritems():
            props[k] = [ U(item) for item in v ]
    return {'record': record}
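# Hypothetical usage, assuming `client` is an instance of the surrounding
# class with self.root set to an OAI-PMH endpoint such as
# http://dspace.mit.edu/oai/request. Note the argument is the full OAI
# identifier, not a bare handle.
def fetch_one(client, oai_id):
    return client.get_record(oai_id)['record']

#e.g. fetch_one(client, 'oai:dspace.mit.edu:1721.1/5451')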
def list_records(self, set="", resumption_token="", metadataPrefix=""): ''' List records. Use either the resumption token or set id. ''' error = None if resumption_token: params = {'verb' : 'ListRecords', 'resumptionToken': resumption_token} else: params = {'verb' : 'ListRecords', 'metadataPrefix': metadataPrefix, 'set': set} qstr = urllib.urlencode(params) url = self.root + '?' + qstr self.logger.debug('OAI request URL: {0}'.format(url)) start_t = time.time() resp, content = self.h.request(url) retrieved_t = time.time() self.logger.debug('Retrieved in {0}s'.format(retrieved_t - start_t)) resumption_token = '' if metadataPrefix in ["mods", "marc", "untl"]: xml_content = XML_PARSE(content) records = [] error = getprop(xml_content, "OAI-PMH/error/#text", True) if error is None: for record in xml_content["OAI-PMH"]["ListRecords"]["record"]: id = record["header"]["identifier"] if "null" not in id: records.append((id, record)) if "resumptionToken" in xml_content["OAI-PMH"]["ListRecords"]: resumption_token = xml_content["OAI-PMH"]["ListRecords"]["resumptionToken"] if isinstance(resumption_token, dict): resumption_token = resumption_token.get("#text", "") else: doc = bindery.parse(url, model=LISTRECORDS_MODELS[metadataPrefix]) records, first_id = metadata_dict(generate_metadata(doc), nesteddict=False) for id_, props in records: for k, v in props.iteritems(): props[k] = [ U(item) for item in v ] if (doc.OAI_PMH.ListRecords is not None) and (doc.OAI_PMH.ListRecords.resumptionToken is not None): resumption_token = U(doc.OAI_PMH.ListRecords.resumptionToken) return {'records': records, 'resumption_token': resumption_token, 'error': error}
def list_records(self, set):
    '''
    List the records in the given set.
    '''
    #e.g. http://dspace.mit.edu/oai/request?verb=ListRecords&metadataPrefix=oai_dc&set=hdl_1721.1_18193
    qstr = urllib.urlencode({'verb': 'ListRecords', 'metadataPrefix': 'oai_dc', 'set': set})
    url = self.root + '?' + qstr
    self.logger.debug('OAI request URL: {0}'.format(url))
    start_t = time.time()
    resp, content = self.h.request(url)
    retrieved_t = time.time()
    self.logger.debug('Retrieved in {0}s'.format(retrieved_t - start_t))
    doc = bindery.parse(url, model=OAI_LISTRECORDS_MODEL)
    #print >> sys.stderr, list(generate_metadata(doc))
    records, first_id = metadata_dict(generate_metadata(doc), nesteddict=False)
    for id_, props in records:
        for k, v in props.iteritems():
            props[k] = [ U(item) for item in v ]
    return records
def factory(rest_uri, relative, outputdir):
    req = urllib2.Request(rest_uri, headers={'Accept': DOCBOOK_IMT})
    resp = urllib2.urlopen(req)
    doc = bindery.parse(resp, standalone=True, model=MOIN_DOCBOOK_MODEL)
    original_wiki_base = dict(resp.info())[ORIG_BASE_HEADER]
    #self.original_wiki_base = dict(resp.info())[ORIG_BASE_HEADER]
    #amara.xml_print(self.content_cache)
    output = os.path.join(outputdir, relative)
    parent_dir = os.path.split(output)[0]
    try:
        os.makedirs(parent_dir)
    except OSError:
        pass
    metadata, first_id = metadata_dict(generate_metadata(doc))
    metadata = metadata[first_id]
    akara_type = first_item(first_item(metadata[u'ak-type']))
    #import sys; print >> sys.stderr, 'GRIPPO', akara_type.xml_value
    cls = node.NODES[akara_type.xml_value]
    instance = cls(rest_uri, relative, outputdir, cache=(doc, metadata, original_wiki_base))
    return instance
def list_records(self, set="", resumption_token="", metadataPrefix=""): ''' List records. Use either the resumption token or set id. ''' if resumption_token: params = {'verb' : 'ListRecords', 'resumptionToken': resumption_token} else: params = {'verb' : 'ListRecords', 'metadataPrefix': metadataPrefix, 'set': set} qstr = urllib.urlencode(params) url = self.root + '?' + qstr self.logger.debug('OAI request URL: {0}'.format(url)) start_t = time.time() resp, content = self.h.request(url) retrieved_t = time.time() self.logger.debug('Retrieved in {0}s'.format(retrieved_t - start_t)) if metadataPrefix == "mods" or metadataPrefix == "marc": xml_content = XML_PARSE(content) records = [] for record in xml_content["OAI-PMH"]["ListRecords"]["record"]: id = record["header"]["identifier"] if "null" not in id: records.append((id, record)) if "resumptionToken" in xml_content["OAI-PMH"]["ListRecords"]: resumption_token = xml_content["OAI-PMH"]["ListRecords"]["resumptionToken"] else: resumption_token = '' else: doc = bindery.parse(url, model=LISTRECORDS_MODELS[metadataPrefix]) records, first_id = metadata_dict(generate_metadata(doc), nesteddict=False) for id_, props in records: for k, v in props.iteritems(): props[k] = [ U(item) for item in v ] if (doc.OAI_PMH.ListRecords is not None) and (doc.OAI_PMH.ListRecords.resumptionToken is not None): resumption_token = U(doc.OAI_PMH.ListRecords.resumptionToken) else: resumption_token = '' return {'records' : records, 'resumption_token' : resumption_token}
def test_metadata_extraction(self):
    """Test metadata extraction"""
    model = schematron_model(MODEL_A)
    doc = bindery.parse(INSTANCE_A_1, model=model)
    metadata = generate_metadata(doc)
    EXPECTED_MD = [
        (u'ep', u'place', u'Hailey,ID'),
        (u'tse', u'place', u'Stamford,CT'),
        (u'tse', u'opus', u'r2e0e3e5'),
        (u'r2e0e3e5', u'title', u'The Wasteland'),
        (u'tse', u'tag', u'old possum'),
        (u'tse', u'tag', u'poet'),
        (u'lh', u'place', u'Harlem,NY'),
        (u'lh', u'tag', u'poet'),
        (u'co', u'place', u'Idoto,Anambra'),
        (u'co', u'opus', u'r2e0e7e5'),
        (u'r2e0e7e5', u'title', u"Heaven's Gate"),
        (u'co', u'tag', u'biafra'),
        (u'co', u'tag', u'poet'),
    ]
    #print list(metadata)
    meta_list = normalize_generated_ids(list(metadata))
    self.assertEqual(meta_list, normalize_generated_ids(EXPECTED_MD))
def dspace_adapter(search=None, id=None):
    '''
    Sample queries:
    curl "http://*****:*****@class="result_table"]//*[@class="article_title"]'):
    '''
    for li in islice(doc.xml_select(u'//*[@id="'+RESULTS_DIV+'"]//*[@class="artifact-description"]/..'), 0, maxarticles):
        row = li.xml_parent.xml_parent
        title = li.xml_select(u'.//*[@class="artifact-title"]')[0]
        rel_id = title.a.href.partition(u'/handle/')[2]
        dspace_id = DSPACE_ID_BASE + rel_id
        alt_link = DSPACE_ARTICLE_BASE + u'1721.1/7488'
        #Do not quote. DSpace doesn't like that
        #alt_link = DSPACE_ARTICLE_BASE + urllib.quote(u'1721.1/7488', '')
        title = unicode(title)
        summary = unicode(row.xml_select(u'string(.//*[@class="summary"])'))
        updated = unicode(row.xml_select(u'string(.//*[@class="date"])')).strip().partition(u'Published: ')[2]
        #updated = time.strptime(updated, "%m/%d/%Y %H:%M:%S") #2/11/2008 2:20:00 AM
        authors = [ (name.strip(), None, None) for name in unicode(row.xml_select(u'string(.//*[@class="author"]//b)')).split(';') ]
        #Retrieve the DSpace page
        qstr = urllib.urlencode({'verb': 'GetRecord', 'metadataPrefix': 'oai_dc', 'identifier': dspace_id})
        url = DSPACE_OAI_ENDPOINT + '?' + qstr
        print >> sys.stderr, url
        #keywords = [ (k.strip(), JOVE_TAG) for k in unicode(row.xml_select(u'string(.//*[@class="keywords"])')).split(',') ]
        doc = bindery.parse(url, model=OAI_MODEL)
        #print >> sys.stderr, list(generate_metadata(doc))
        resources, first_id = metadata_dict(generate_metadata(doc))
        record = doc.OAI_PMH
        resource = resources[first_id]
        authors = [ (a, None, None) for a in resource[u'creator'] ]
        links = [
            (DSPACE_ARTICLE_BASE + rel_id, u'alternate'),
            (u'dspace?id=' + dspace_id, u'self'),
        ]
        elements = [
            E((ATOM_NAMESPACE, u'content'), {u'src': alt_link}),
        ]
        f.append(
            dspace_id,
            U(resource['title']),
            updated=U(resource['date']),
            summary=U(resource['description']),
            authors=authors,
            links=links,
            #categories=categories,
            elements=elements,
        )
    #FIXME: indent
    return f.xml_encode()