def _extract_biblio(self, redirect_page, id=None):
    """Follow the url embedded in *redirect_page* and extract biblio from it.

    Parses the redirect page for a 'url' element, fetches that url from the
    data provider, and extracts title and published_date from the response.

    :param redirect_page: XML string containing a 'url' element.
    :param id: item id, unused here (kept for interface consistency).
    :returns: biblio dict, or {} when the provider does not answer HTTP 200.
    """
    redirect_dict_of_keylists = {'url': ['url']}
    redirect_dict = provider._extract_from_xml(redirect_page, redirect_dict_of_keylists)
    # Bind the url once: the warning branch below previously referenced an
    # undefined name `url`, raising NameError on any non-200 response.
    url = redirect_dict["url"]
    logger.info(u"%20s WARNING, url= %s" % (self.provider_name, url))

    # try to get a response from the data provider
    response = self.http_get(url)
    if response.status_code != 200:
        logger.warning(u"%20s WARNING, status_code=%i getting %s"
                       % (self.provider_name, response.status_code, url))
        self._get_error(response.status_code, response)
        return {}

    dict_of_keylists = {
        'title': ['dataset', 'title'],
        'published_date': ['dataset', 'pubDate'],
    }
    biblio_dict = provider._extract_from_xml(response.text, dict_of_keylists)
    return biblio_dict
def _extract_biblio(self, redirect_page, id=None):
    """Fetch the data provider's landing page via the redirect url and
    extract biblio (title, published_date) from it.

    :param redirect_page: XML string containing a 'url' element.
    :param id: item id, unused here (kept for interface consistency).
    :returns: biblio dict, or {} when the fetch does not return HTTP 200.
    """
    redirect_dict_of_keylists = {'url': ['url']}
    redirect_dict = provider._extract_from_xml(redirect_page, redirect_dict_of_keylists)
    # Bind the url once: the warning branch below previously referenced an
    # undefined name `url`, raising NameError on any non-200 response.
    url = redirect_dict["url"]
    logger.info("%20s WARNING, url= %s" % (self.provider_name, url))

    # try to get a response from the data provider
    response = self.http_get(url)
    if response.status_code != 200:
        logger.warning("%20s WARNING, status_code=%i getting %s"
                       % (self.provider_name, response.status_code, url))
        self._get_error(response.status_code, response)
        return {}

    dict_of_keylists = {
        'title': ['dataset', 'title'],
        'published_date': ['dataset', 'pubDate'],
    }
    biblio_dict = provider._extract_from_xml(response.text, dict_of_keylists)
    return biblio_dict
def _extract_biblio(self, page, id=None):
    """Extract biblio (date parts, title, journal, authors) from a PubMed
    efetch XML page.

    :param page: PubMed efetch XML string.
    :param id: item id, used only for the debug log message.
    :returns: biblio dict; "date" is set only when year/month/day all parse.
    """
    dict_of_keylists = {
        "year": ["PubmedArticleSet", "MedlineCitation", "Article", "ArticleDate", "Year"],
        "month": ["PubmedArticleSet", "MedlineCitation", "Article", "ArticleDate", "Month"],
        "day": ["PubmedArticleSet", "MedlineCitation", "Article", "ArticleDate", "Day"],
        "title": ["PubmedArticleSet", "MedlineCitation", "Article", "ArticleTitle"],
        "journal": ["PubmedArticleSet", "MedlineCitation", "Article", "Journal", "Title"],
    }
    biblio_dict = provider._extract_from_xml(page, dict_of_keylists)

    dom_authors = provider._find_all_in_xml(page, "LastName")
    try:
        biblio_dict["authors"] = ", ".join([author.firstChild.data for author in dom_authors])
    except (AttributeError, TypeError):
        pass

    try:
        # Values extracted from XML are strings, but the datetime constructor
        # requires ints.  Bug fix: without int(), a TypeError was raised and
        # silently swallowed below, so "date" was never populated.
        # ValueError is caught for non-numeric months (e.g. "Jan"), which
        # previously fell into the same silent-skip path via TypeError.
        datetime_published = datetime.datetime(year=int(biblio_dict["year"]),
                                               month=int(biblio_dict["month"]),
                                               day=int(biblio_dict["day"]))
        biblio_dict["date"] = datetime_published.isoformat()
        del biblio_dict["month"]
        del biblio_dict["day"]
    except (AttributeError, TypeError, KeyError, ValueError):
        logger.debug("%20s don't have full date information %s" % (self.provider_name, id))
        pass
    return biblio_dict
def _extract_biblio(self, page, id=None):
    """Pull title, date, year, and author surnames from an arXiv Atom entry.

    Also tags the record with the repository name and a templated
    free-fulltext url.
    """
    keylists = {
        'title': ['entry', 'title'],
        'date': ['entry', 'published'],
    }
    biblio_dict = provider._extract_from_xml(page, keylists)

    name_nodes = provider._find_all_in_xml(page, "name")
    try:
        full_names = [node.firstChild.data for node in name_nodes]
        surnames = [full_name.split(" ")[-1] for full_name in full_names]
        biblio_dict["authors"] = ", ".join(surnames)
    except (AttributeError, TypeError):
        pass

    # Year is just the leading four characters of the ISO published date.
    if "date" in biblio_dict:
        biblio_dict["year"] = biblio_dict["date"][0:4]

    biblio_dict["repository"] = "arXiv"
    biblio_dict["free_fulltext_url"] = self._get_templated_url(
        self.aliases_url_template, id, "aliases")
    return biblio_dict
def test_extract_xml(self):
    """The XML extractor should map total_count into 'count'."""
    extracted = provider._extract_from_xml(
        self.TEST_XML, {'count': ['total_count']})
    assert_equals(extracted, {'count': 17})
def _extract_biblio_efetch(self, page, id=None):
    """Extract biblio from a PubMed efetch XML page.

    Uses the electronic ArticleDate when present, otherwise the journal
    PubDate.  Adds authors (surnames), MeSH keywords, a dash-stripped ISSN,
    and an ISO "date" when full year/month/day values are available.

    :param page: PubMed efetch XML string.
    :param id: item id, used only for the debug log message.
    :returns: biblio dict.
    """
    article_path = ["PubmedArticleSet", "MedlineCitation", "Article"]
    # The two original branches differed only in the date element name;
    # build the keylists once from the shared prefix.
    date_element = "ArticleDate" if "ArticleDate" in page else "PubDate"
    dict_of_keylists = {
        "year": article_path + [date_element, "Year"],
        "month": article_path + [date_element, "Month"],
        "day": article_path + [date_element, "Day"],
        "title": article_path + ["ArticleTitle"],
        "abstract": article_path + ["Abstract", "AbstractText"],
        "issn": article_path + ["Journal", "ISSN"],
        "journal": article_path + ["Journal", "Title"],
    }
    biblio_dict = provider._extract_from_xml(page, dict_of_keylists)

    dom_authors = provider._find_all_in_xml(page, "LastName")
    try:
        biblio_dict["authors"] = ", ".join([author.firstChild.data for author in dom_authors])
    except (AttributeError, TypeError):
        pass

    mesh_list = provider._find_all_in_xml(page, "DescriptorName")
    try:
        if mesh_list:
            biblio_dict["keywords"] = "; ".join([mesh_term.firstChild.data for mesh_term in mesh_list])
    except (AttributeError, TypeError):
        pass

    try:
        # Normalize ISSN to digits only (e.g. "1234-5678" -> "12345678")
        biblio_dict["issn"] = biblio_dict["issn"].replace("-", "")
    except (AttributeError, KeyError):
        pass

    try:
        # XML-extracted values are strings; the datetime constructor needs
        # ints.  Bug fix: without int(), a TypeError was raised and silently
        # swallowed, so "date" was never set.  ValueError covers non-numeric
        # months (e.g. "Jan"), which take the same silent-skip path.
        datetime_published = datetime.datetime(year=int(biblio_dict["year"]),
                                               month=int(biblio_dict["month"]),
                                               day=int(biblio_dict["day"]))
        biblio_dict["date"] = datetime_published.isoformat()
        # Keep only digits in the year (raw string fixes the "\D" invalid
        # escape sequence in the original).
        biblio_dict["year"] = re.sub(r"\D", "", str(biblio_dict["year"]))
        del biblio_dict["month"]
        del biblio_dict["day"]
    except (AttributeError, TypeError, KeyError, ValueError):
        logger.debug(u"%20s don't have full date information %s" % (self.provider_name, id))
        pass

    try:
        biblio_dict["year"] = str(biblio_dict["year"])
    except (KeyError):
        pass

    return biblio_dict
def _extract_aliases_from_doi(self, page, doi):
    """Map a doi to its pmid via a PubMed eSearch response.

    Accepts the pmid only when PubMed's QueryTranslation confirms the
    search was for exactly this doi (avoids false matches from loose
    term expansion).

    :param page: eSearch XML string.
    :param doi: the doi that was searched for.
    :returns: [("pmid", <id>)] on a confirmed match, else [].
    """
    dict_of_keylists = {"pmid": ["eSearchResult", "IdList", "Id"],
                        "QueryTranslation": ["eSearchResult", "QueryTranslation"]}
    aliases_dict = provider._extract_from_xml(page, dict_of_keylists)
    aliases_list = []
    if aliases_dict:
        # .get() guards against responses missing either element; the
        # previous bare lookups could raise KeyError on partial results.
        if aliases_dict.get("QueryTranslation") == (doi + "[All Fields]"):
            if aliases_dict.get("pmid") is not None:
                aliases_list = [("pmid", str(aliases_dict["pmid"]))]
    return aliases_list
def _extract_biblio(self, page, id=None):
    """Biblio for a Slideshare deck: title, username, created, repository."""
    self._sanity_check_page(page)
    keylists = {
        'title': ['Slideshow', 'Title'],
        'username': ['Slideshow', 'Username'],
        'created': ['Slideshow', 'Created'],
    }
    biblio = provider._extract_from_xml(page, keylists)
    biblio["repository"] = "Slideshare"
    return biblio
def _extract_aliases(self, page, id=None):
    """Return (namespace, id) alias pairs pulled from a Slideshare page."""
    self._sanity_check_page(page)
    aliases_dict = provider._extract_from_xml(
        page, {'title': ['Slideshow', 'Title']})
    if not aliases_dict:
        return []
    # iteritems() already yields (namespace, nid) tuples
    return list(aliases_dict.iteritems())
def _extract_biblio(self, page, id=None):
    """Extract Slideshare biblio fields and tag the repository name."""
    self._sanity_check_page(page)
    # (biblio key, Slideshare XML tag) pairs
    field_tags = (('title', 'Title'),
                  ('username', 'Username'),
                  ('created', 'Created'))
    dict_of_keylists = dict(
        (field, ['Slideshow', tag]) for field, tag in field_tags)
    biblio_dict = provider._extract_from_xml(page, dict_of_keylists)
    biblio_dict["repository"] = "Slideshare"
    return biblio_dict
def _extract_aliases(self, page, id=None):
    """Turn the extracted title (if any) into alias (namespace, id) tuples."""
    self._sanity_check_page(page)
    extracted = provider._extract_from_xml(
        page, {'title': ['Slideshow', 'Title']})
    aliases_list = []
    if extracted:
        for namespace, nid in extracted.iteritems():
            aliases_list.append((namespace, nid))
    return aliases_list
def _extract_biblio(self, page, id=None):
    """Extract year, title, journal, and author surnames from PubMed XML."""
    article = ["PubmedArticleSet", "MedlineCitation", "Article"]
    dict_of_keylists = {
        "year": article + ["Journal", "PubDate", "Year"],
        "title": article + ["ArticleTitle"],
        "journal": article + ["Journal", "Title"],
    }
    biblio_dict = provider._extract_from_xml(page, dict_of_keylists)

    last_name_nodes = provider._find_all_in_xml(page, "LastName")
    try:
        surnames = [node.firstChild.data for node in last_name_nodes]
        biblio_dict["authors"] = ", ".join(surnames)
    except (AttributeError, TypeError):
        # no author elements, or unexpected node shape: leave authors unset
        pass
    return biblio_dict
def _extract_aliases(self, page, id=None):
    """Extract the url alias and, when the url embeds a DOI, a doi alias."""
    aliases_dict = provider._extract_from_xml(page, {'url': ['url']})
    try:
        derived_doi = provider.doi_from_url_string(aliases_dict["url"])
        if derived_doi:
            aliases_dict["doi"] = derived_doi
    except KeyError:
        # no url extracted; nothing to derive a doi from
        pass
    if not aliases_dict:
        return []
    return [pair for pair in aliases_dict.iteritems()]
def _extract_metrics(self, page, status_code=200, id=None):
    """Extract Slideshare metric counts as ints; {} on 404, raise otherwise."""
    if status_code != 200:
        if status_code == 404:
            return {}
        raise(self._get_error(status_code))
    self._sanity_check_page(page)

    # (metric name, Slideshare XML tag) pairs
    metric_tags = (('slideshare:downloads', 'NumDownloads'),
                   ('slideshare:views', 'NumViews'),
                   ('slideshare:comments', 'NumComments'),
                   ('slideshare:favorites', 'NumFavorites'))
    dict_of_keylists = dict(
        (metric, ['Slideshow', tag]) for metric, tag in metric_tags)
    metrics_dict = provider._extract_from_xml(page, dict_of_keylists)
    # counters come back as strings; coerce each one in place
    for metric_name in metrics_dict:
        metrics_dict[metric_name] = int(metrics_dict[metric_name])
    return metrics_dict
def _extract_metrics(self, page, status_code=200, id=None):
    """Parse the Slideshare XML metric counters into ints.

    Returns {} for a 404; raises the provider error for any other
    non-200 status.
    """
    # guard clauses: 404 is "no metrics", other non-200 is an error
    if status_code == 404:
        return {}
    if status_code != 200:
        raise (self._get_error(status_code))
    self._sanity_check_page(page)

    dict_of_keylists = {
        'slideshare:downloads': ['Slideshow', 'NumDownloads'],
        'slideshare:views': ['Slideshow', 'NumViews'],
        'slideshare:comments': ['Slideshow', 'NumComments'],
        'slideshare:favorites': ['Slideshow', 'NumFavorites'],
    }
    raw_metrics = provider._extract_from_xml(page, dict_of_keylists)
    for name in raw_metrics:
        raw_metrics[name] = int(raw_metrics[name])
    return raw_metrics
def _extract_aliases(self, page, id=None):
    """Extract the landing url alias; add a doi alias when one is derivable."""
    aliases_dict = provider._extract_from_xml(page, {'url': ['url']})
    try:
        candidate_doi = provider.doi_from_url_string(aliases_dict["url"])
        if candidate_doi:
            aliases_dict["doi"] = candidate_doi
    except KeyError:
        # no url in the page; skip doi derivation
        pass
    return list(aliases_dict.iteritems()) if aliases_dict else []
def _extract_metrics(self, page, status_code=200, id=None):
    """Extract Facebook link-stat metrics from a links.getStats response.

    Bug fix: 'facebook:likes' was mapped to ['share_count'] and
    'facebook:shares' to ['like_count'] — the two counters were swapped.

    :param page: links.getStats XML string.
    :param status_code: HTTP status of the fetch; {} on 404, error otherwise.
    :returns: dict of facebook:* metric values.
    """
    if status_code != 200:
        if status_code == 404:
            return {}
        else:
            raise(self._get_error(status_code))

    if not "links_getStats_response" in page:
        raise ProviderContentMalformedError

    dict_of_keylists = {
        'facebook:likes': ['like_count'],
        'facebook:shares': ['share_count'],
        'facebook:comments': ['comment_count'],
        'facebook:clicks': ['click_count'],
    }
    metrics_dict = provider._extract_from_xml(page, dict_of_keylists)
    return metrics_dict
def _extract_biblio(self, page, id=None):
    """Build arXiv biblio: title, date, year, author surnames, fulltext url."""
    biblio_dict = provider._extract_from_xml(page, {
        'title': ['entry', 'title'],
        'date': ['entry', 'published'],
    })

    author_nodes = provider._find_all_in_xml(page, "name")
    try:
        surnames = []
        for node in author_nodes:
            # last whitespace-separated token of the full name
            surnames.append(node.firstChild.data.split(" ")[-1])
        biblio_dict["authors"] = ", ".join(surnames)
    except (AttributeError, TypeError):
        pass

    try:
        # first four characters of the ISO published date
        biblio_dict["year"] = biblio_dict["date"][0:4]
    except KeyError:
        pass

    biblio_dict["repository"] = "arXiv"
    biblio_dict["free_fulltext_url"] = self._get_templated_url(
        self.aliases_url_template, id, "aliases")
    return biblio_dict
def test_extract_xml(self):
    """_extract_from_xml should surface total_count as 'count' == 17."""
    dict_of_keylists = {'count': ['total_count']}
    result = provider._extract_from_xml(self.TEST_XML, dict_of_keylists)
    assert_equals({'count': 17}, result)