def fetch(self):
    # Resolve a DOI: first look in the URI itself, then in the page body
    # behind the URI, and finally fall back to the URI unchanged.
    doi_str = (doi.find_doi_in_text(self.uri)
               or doi.find_doi_in_text(
                   urllib.request.urlopen(self.uri).read().decode('utf-8'))
               or self.uri)
    ctx = self.fetch_from_doi(doi_str)
    if ctx:
        if ctx.data:
            self.ctx.data = ctx.data
        if ctx.files:
            self.ctx.files = ctx.files
    return self.get_files()

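# A minimal sketch of the DOI resolution order used in fetch() above, assuming
# the `doi` module referenced there is importable. The example URL and its
# expected DOI come from the test data below; the page-body fallback is omitted.
import doi

uri = 'https://doi.org/10.1093/analys/anw053'
doi_str = doi.find_doi_in_text(uri) or uri
# doi_str == '10.1093/analys/anw053'
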
def test_find_doi_in_line() -> None:
    test_data = [
        ('http://dx.doi.org/10.1063/1.881498', '10.1063/1.881498'),
        ('http://dx.doi.org/10.1063%2F1.881498', '10.1063/1.881498'),
        (2 * 'qer ' + 'var doi = "12345/12345.3"', '12345/12345.3'),
        (2 * 'qer ' + "var doi = '12345/12345.3';fas", '12345/12345.3'),
        (2 * 'qer ' + "var DoI = 12345%2F12345.3", '12345/12345.3'),
        (2 * 'qer ' + "var DoI : 12345%2F12345.3", '12345/12345.3'),
        ('http://scitation.org/doi/10.1063/1.881498', '10.1063/1.881498'),
        ('org/doi(10.1063/1.881498)', '10.1063/1.881498'),
        ('/scitation.org/doi/10.1063/1.881498?234saf=34', '10.1063/1.881498'),
        ('/scitation.org/doi/10.1063/1.88149 8?234saf=34', '10.1063/1.88149'),
        ('/scitation.org/doi/10.1063/1.uniau12?as=234', '10.1063/1.uniau12'),
        ('https://doi.org/10.1093/analys/anw053', '10.1093/analys/anw053'),
        ('http://.scitation.org/doi/10.1063/1.mart(88)1498?asdfwer',
         '10.1063/1.mart(88)1498'),
        ('@ibook{doi:10.1002/9780470125915.ch2,', '10.1002/9780470125915.ch2'),
        ('<rdf:Description rdf:about="" xmlns:dc="http://purl.org/dc/elements'
         '.1/"><dc:format>application/pdf</dc:format><dc:identifier>'
         'doi:10.1063/1.5079474</dc:identifier></rdf:Description>',
         '10.1063/1.5079474'),
        ('<(DOI:10.1002/9780470915.CH2)/S/URI,', '10.1002/9780470915.CH2'),
        ('URL<(DOI:10.1002/9780470125915.CH2,', '10.1002/9780470125915.CH2'),
        (r'A<</S/URI/URI(https://doi.org/10.1016/j.comptc.2018.10.004)>>/'
         r'Border[0 0 0]/M(D:20181022082356+0530)/Rect[147.40158 594.36926'
         r'347.24957 605.36926]/Subtype/Link/Type/A',
         '10.1016/j.comptc.2018.10.004'),
        ('doi(10.1038/s41535-018-0103-6;)', '10.1038/s41535-018-0103-6'),
    ]
    for url, doi in test_data:
        assert find_doi_in_text(url) == doi

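# A minimal standalone usage sketch (not part of the test suite), assuming
# find_doi_in_text has been imported as in the test above. The input string
# and expected result are taken from the test data.
text = 'see https://doi.org/10.1093/analys/anw053 for details'
assert find_doi_in_text(text) == '10.1093/analys/anw053'
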
def _search_direct_url(self):
    """
    Sci-Hub embeds papers in an iframe. This function finds the actual
    source url, which looks something like
    https://moscow.sci-hub.io/.../....pdf.
    """
    logger.debug('pinging {0}'.format(self.base_url))
    ping = self.session.get(self.base_url, timeout=1, verify=False)
    if not ping.status_code == 200:
        logger.error('server {0} is down'.format(self.base_url))
        return None
    logger.debug('server {0} is up'.format(self.base_url))
    url = "{0}{1}".format(self.base_url, self.uri)
    logger.debug('scihub url {0}'.format(url))
    res = self.session.get(url, verify=False)
    logger.debug('Scraping scihub site')
    logger.debug('trying to get doi')
    self.doi = doi.find_doi_in_text(res.content.decode('utf8')) or ''
    if self.doi:
        logger.info('found a doi candidate {0}'.format(self.doi))
    s = BeautifulSoup(res.content, 'html.parser')
    iframe = s.find('iframe')
    if iframe:
        logger.debug('iframe found in scihub\'s html')
        # The iframe src may be scheme-relative (starting with '//');
        # prepend 'https:' so the returned direct url is absolute.
        return (
            iframe.get('src')
            if not iframe.get('src').startswith('//')
            else 'https:' + iframe.get('src')
        )

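# Illustrative sketch of the scheme-relative handling above; the src value is
# hypothetical, modelled on the URL shape described in the docstring.
src = '//moscow.sci-hub.io/some/path/paper.pdf'
direct_url = src if not src.startswith('//') else 'https:' + src
# direct_url == 'https://moscow.sci-hub.io/some/path/paper.pdf'
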
def fetch(self) -> None:
    _doi = doi.find_doi_in_text(self.uri)
    if _doi is None:
        return None
    importer = Importer(uri=_doi)
    importer.fetch()
    self.ctx = importer.ctx

def get_doi(self) -> Optional[str]:
    if self.ctx.data and 'doi' in self.ctx.data:
        _doi = self.ctx.data['doi']
        return str(_doi) if _doi else None
    soup = self._get_soup()
    self.logger.info('Trying to parse doi from url body...')
    if soup:
        return doi.find_doi_in_text(str(soup))
    else:
        return None

def match(cls, uri: str) -> Optional[papis.downloaders.Downloader]:
    if doi.find_doi_in_text(uri):
        return Downloader(uri)
    else:
        return None

def fetch(self):
    importer = Importer(uri=doi.find_doi_in_text(self.uri))
    importer.fetch()
    self.ctx = importer.ctx

def match(cls, uri):
    if doi.find_doi_in_text(uri):
        return Downloader(uri)
    else:
        return None

def get_doi(self):
    if 'doi' in self.ctx.data:
        return self.ctx.data['doi']
    soup = self._get_soup()
    self.logger.info('trying to parse doi...')
    return doi.find_doi_in_text(str(soup))