def test_repeating(self, datadir, event_loop):
    set_config({'unify_booktitle': [
        [r'(?P<prefix>.*) remove(?P<suffix>.*)', r'{prefix}{suffix}',
         'priority:50', 'repeat', 'kind:plain'],
    ]})
    unify_me = make_entry(
        {'booktitle': 'Proceedings remove of remove some remove conference'})
    u = Unifier()
    sugg = u.unify_entry(unify_me)
    assert sugg.data['booktitle'][0] == ('Proceedings of some conference',
                                         Suggestion.KIND_PLAIN)

    # Test repeat-unifying a suggestion
    sugg = Suggestion('test', unify_me)
    sugg.add_field('booktitle',
                   'Proceedings remove of remove some remove conference')
    u.unify_suggestion(sugg)
    assert sugg.data['booktitle'][0] == ('Proceedings of some conference',
                                         Suggestion.KIND_PLAIN)

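
# A minimal sketch (assumed semantics, not the actual Unifier code) of how
# a 'repeat' rule can work: re-apply the substitution until the value no
# longer changes. `apply_repeating` is a hypothetical illustration name.
import re

def apply_repeating(pattern, template, value):
    """Apply `pattern` -> `template` until a fixed point is reached."""
    while True:
        m = re.fullmatch(pattern, value)
        if m is None:
            return value
        new_value = template.format(**m.groupdict())
        if new_value == value:  # Guard against non-shrinking rules.
            return value
        value = new_value

# apply_repeating(r'(?P<prefix>.*) remove(?P<suffix>.*)', '{prefix}{suffix}',
#                 'Proceedings remove of remove some remove conference')
# -> 'Proceedings of some conference'
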
def test_chaining(self, datadir, event_loop):
    set_config({'unify_booktitle': [
        [r'(?P<prefix>.*)first(?P<suffix>.*)', r'{prefix}1st{suffix}',
         'kind:plain', 'priority:50'],
        [r'(?P<prefix>.*) IEEE(?P<suffix>.*)', r'{prefix}{suffix}',
         'kind:regex']
    ]})
    unify_me = make_entry(
        {'booktitle': 'Proceedings of the first IEEE conference on whatever'})
    u = Unifier()
    sugg = u.unify_entry(unify_me)
    assert sugg.data['booktitle'][0] == (
        'Proceedings of the 1st conference on whatever', Suggestion.KIND_RE)

    # Test chain-unifying a suggestion
    sugg = Suggestion('test', unify_me)
    sugg.add_field('booktitle',
                   'Proceedings of the first IEEE conference on whatever')
    u.unify_suggestion(sugg)
    assert sugg.data['booktitle'][0] == (
        'Proceedings of the 1st conference on whatever', Suggestion.KIND_RE)

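
# A minimal sketch of the chaining semantics this test relies on (assumed,
# not the actual implementation): rules run in descending 'priority' order,
# each one transforming the output of the previous one. `chain_rules` is a
# hypothetical illustration name.
import re

def chain_rules(rules, value):
    """`rules` is a list of (pattern, template, priority) triples."""
    for pattern, template, _ in sorted(rules, key=lambda r: r[2],
                                       reverse=True):
        m = re.fullmatch(pattern, value)
        if m is not None:
            value = template.format(**m.groupdict())
    return value

# chain_rules(
#     [(r'(?P<prefix>.*)first(?P<suffix>.*)', '{prefix}1st{suffix}', 50),
#      (r'(?P<prefix>.*) IEEE(?P<suffix>.*)', '{prefix}{suffix}', 0)],
#     'Proceedings of the first IEEE conference on whatever')
# -> 'Proceedings of the 1st conference on whatever'
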
def test_unify_suggestion(self, datadir, event_loop):
    set_config({'unify_booktitle': [
        [r'\d{4} IEEE (?P<name>[^\(]*) \((?P<short>[^\)]*)\)',
         r'Proceedings of the \d*(th|st|nd|rd) {name} \({short}.*\)'],
    ]})
    dummy_entry = make_entry({})
    sugg = Suggestion('test', dummy_entry)
    sugg.add_field('booktitle', ('2016 IEEE International Parallel and'
                                 ' Distributed Processing Symposium (IPDPS)'))
    u = Unifier()
    u.unify_suggestion(sugg)
    assert sugg.data['booktitle'][0] == (
        r'Proceedings of the \d*(th|st|nd|rd) International Parallel and'
        r' Distributed Processing Symposium \(IPDPS.*\)',
        Suggestion.KIND_RE)

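
# A hedged sketch of how a rule can yield a regex-kind suggestion like the
# one asserted above (assumed mechanics, not the Unifier internals): the
# captured groups are re-escaped and substituted into a replacement template
# that itself contains regex syntax. `apply_regex_rule` is a hypothetical
# illustration name.
import re

def apply_regex_rule(pattern, template, value):
    m = re.fullmatch(pattern, value)
    if m is None:
        return None
    # Escape captured text so it stays literal inside the regex template.
    groups = {k: re.escape(v) for k, v in m.groupdict().items()}
    return template.format(**groups)
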
def _query_blocking(self, entry, provider):
    isbn = entry.data.get('isbn')
    if not isbn:
        self._ui.finish_subtask('ISBNQuery')
        return None
    if notisbn(isbn):
        self._ui.finish_subtask('ISBNQuery')
        return (None, "{} is not a valid ISBN.".format(isbn))

    # Okay, we're actually going to make a HTTP request. (Rate-limit only
    # after the validity checks, so invalid entries don't consume budget.)
    self._ratelimit.get()

    try:
        bibtex_data = self._formatter(meta(isbn, service=provider))
    except ISBNLibException as e:
        self._ui.finish_subtask('ISBNQuery')
        return (None, e)
    except socket.timeout:
        self._ui.finish_subtask('ISBNQuery')
        raise RetrievalProblem("Socket timeout during"
                               " ISBN metadata retrieval")

    try:
        parsed_data = bibtexparser.loads(bibtex_data)
    except Exception:
        # Avoid a bare `except:`, which would also catch KeyboardInterrupt.
        self._ui.finish_subtask('ISBNQuery')
        raise RetrievalProblem("Data from ISBN source could not be parsed")

    if len(parsed_data.entries) != 1:
        self._ui.finish_subtask('ISBNQuery')
        raise RetrievalProblem(
            "ISBN search did not return exactly one result.")

    retrieved = Entry(parsed_data.entries[0], self._ui)
    s = Suggestion("isbn_{}".format(provider), entry)

    for (k, v) in retrieved.data.items():
        if k.lower() == 'id':
            continue
        s.add_field(k, v)

    # Copy names from the retrieved entry, not from the suggestion itself.
    for (first, last) in retrieved.authors:
        s.add_author(first, last)
    for (first, last) in retrieved.editors:
        s.add_editor(first, last)

    return (s, None)

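
# For reference, a self-contained sketch of the isbnlib calls used above.
# `meta`, `notisbn`, and the 'bibtex' formatter are real isbnlib APIs; the
# 'goob' (Google Books) default and the name `fetch_isbn_bibtex` are
# assumptions for illustration, and error handling is omitted.
from isbnlib import meta, notisbn
from isbnlib.registry import bibformatters

def fetch_isbn_bibtex(isbn, provider='goob'):
    """Return BibTeX metadata for `isbn`, or None if it is invalid."""
    if notisbn(isbn):
        return None
    return bibformatters['bibtex'](meta(isbn, service=provider))
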
def test_re_suggestion(self, datadir):
    e = make_entry({
        'title': 'This is some title.',
        'booktitle': ("Proceedings of the 20th Conference on Something"
                      " Awesome (CSA'20)")
    })
    s = Suggestion('test', e)
    s.add_field('title', 'This is some title.', kind=Suggestion.KIND_RE)
    s.add_field('booktitle',
                r'Proceedings of the \d+(th|st|rd|nd) .* \(.*\)',
                kind=Suggestion.KIND_RE)
    d = Differ(e)
    result = d.diff(s)
    assert result == []

    s = Suggestion('nonmatching_test', e)
    s.add_field('booktitle', r'Nope', kind=Suggestion.KIND_RE)
    result = d.diff(s)
    assert len(result) == 1

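
# A minimal sketch of the KIND_RE comparison the assertions above imply
# (assumed, not the Differ internals): a regex-kind suggestion produces no
# diff if the entry's value fully matches the suggested pattern.
# `field_matches` is a hypothetical illustration name.
import re

def field_matches(entry_value, suggested_value, is_regex):
    if is_regex:
        return re.fullmatch(suggested_value, entry_value) is not None
    return entry_value == suggested_value

# field_matches("Proceedings of the 20th Conference on Something Awesome"
#               " (CSA'20)",
#               r'Proceedings of the \d+(th|st|rd|nd) .* \(.*\)',
#               True)  # -> True
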
def test_list_ignore_order(self, datadir):
    e = make_entry({
        'title': 'This is some title.',
        'issn': '1234-5678, 2345-67890'
    })
    s = Suggestion('test', e)
    s.add_field('title', 'This is some title.')
    s.add_field('issn', '2345-67890, 1234-5678')
    d = Differ(e)
    result = d.diff(s)
    assert result == []

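
# A minimal sketch of order-insensitive comparison for comma-separated list
# fields such as 'issn', consistent with the assertion above.
# `lists_equal_unordered` is a hypothetical illustration name.
def lists_equal_unordered(a, b):
    def normalize(s):
        return sorted(part.strip() for part in s.split(','))
    return normalize(a) == normalize(b)

# lists_equal_unordered('1234-5678, 2345-67890',
#                       '2345-67890, 1234-5678')  # -> True
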
async def _execute_query(self, entry, url, retry_number=0):
    if not url:
        self._ui.finish_subtask('MetaQuery')
        return None

    # Okay, we're actually going to make a HTTP request
    await self._ratelimit.get()

    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url,
                                   headers=MetaSource.HEADERS) as resp:
                status = resp.status

                if status == 403:
                    # Only ignore decoding problems here; a detected captcha
                    # must abort the query. (A bare `except:` around this
                    # block would also swallow the RetrievalProblem below.)
                    try:
                        html = await resp.text()
                    except UnicodeDecodeError:
                        html = None
                    if html is not None and self._detect_captcha(html):
                        self._ui.finish_subtask('MetaQuery')
                        LOGGER.info(f"URL {url} requires a captcha to "
                                    "be solved. Giving up.")
                        raise RetrievalProblem(
                            f"URL {url} requires a captcha to be solved.")

                    if retry_number == self._max_retries:
                        self._ui.finish_subtask('MetaQuery')
                        raise RetrievalProblem(
                            f"URL {url} still results in 403 after "
                            f"{self._max_retries} retries. Giving up.")

                    LOGGER.debug(f"Got a 403 while accessing {url}. "
                                 f"Backing off. Retry {retry_number + 1}...")
                    await self._ratelimit.backoff()
                    await asyncio.sleep(self._retry_pause)
                    return await self._execute_query(entry, url,
                                                     retry_number + 1)

                if status != 200:
                    self._ui.finish_subtask('MetaQuery')
                    raise RetrievalProblem(
                        "Accessing URL {} returns status {}".format(
                            url, status))

                try:
                    html = await resp.text()
                except UnicodeDecodeError:
                    self._ui.finish_subtask('MetaQuery')
                    raise RetrievalProblem(
                        f"Content at URL {url} could not be interpreted")

                parser = MetadataHTMLParser(self._ui, str(resp.url))
                parser.feed(html)

                sugg = Suggestion("meta", entry)
                for (k, v) in parser.get_metadata().items():
                    if isinstance(v, list):
                        sugg.add_field(k, [remove_tags(vi) for vi in v])
                    else:
                        sugg.add_field(k, remove_tags(v))

                for (first, last) in parser.get_authors():
                    sugg.add_author(first, last)

                self._ui.finish_subtask('MetaQuery')
                return sugg
    except asyncio.TimeoutError:
        self._ui.finish_subtask('MetaQuery')
        LOGGER.error(f"Timeout trying to retrieve URL {url}")
        raise RetrievalProblem(f"Timeout trying to retrieve URL {url}")

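
# A hedged sketch of the rate-limiter interface the method above relies on:
# get() gates each request, backoff() slows subsequent requests down after a
# 403. This is an assumed illustration, not the actual implementation.
import asyncio

class BackoffRateLimit:
    def __init__(self, interval=1.0, max_interval=60.0):
        self._interval = interval
        self._max_interval = max_interval
        self._lock = asyncio.Lock()

    async def get(self):
        # Serialize callers and wait out the current interval.
        async with self._lock:
            await asyncio.sleep(self._interval)

    async def backoff(self):
        # Exponentially increase the delay between requests, up to a cap.
        self._interval = min(self._interval * 2, self._max_interval)
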
def _query_blocking(self, entry):
    doi = entry.get_probable_doi()
    if not doi:
        self._ui.finish_subtask('CrossrefQuery')
        return None

    try:
        data = crossref_commons.retrieval.get_publication_as_json(doi)
    except ValueError as e:
        self._ui.finish_subtask('CrossrefQuery')
        if str(e) == f"DOI {doi} does not exist":
            # This isn't really an error; Crossref just does not know
            # about this DOI.
            pass
        else:
            LOGGER.error(f"Error retrieving data for {entry.get_id()}. {e}")
        return None
    except ConnectionError as e:
        # TODO retry?
        self._ui.finish_subtask('CrossrefQuery')
        LOGGER.error(f"Connection error retrieving data for "
                     f"{entry.get_id()}. {e}")
        return None

    s = Suggestion("crossref", entry)

    # Special handling for the entry type
    btype = TYPE_MAPPING.get(data['type'])
    if not btype:
        LOGGER.warning(
            "Type {} not found in crossref source. (Entry {})".format(
                data['type'], entry.get_id()))
    else:
        s.add_field('entrytype', btype)

    # Special handling for authors
    for author_data in data.get('author', []):
        s.add_author(author_data.get('given', "").strip(),
                     author_data.get('family', "").strip())

    # Special handling for editors
    for editor_data in data.get('editor', []):
        s.add_editor(editor_data.get('given', "").strip(),
                     editor_data.get('family', "").strip())

    # Special handling for journal / book title. These are Crossref type
    # names, so compare against data['type'], not the mapped BibTeX type.
    if data['type'] in ['journal-article', 'book-chapter']:
        journal = flexistrip(data.get('container-title'))
        if journal:
            s.add_field('journal', journal)

    # Special handling for the URL: only take it if it's not a DOI URL
    url = flexistrip(data.get('URL'))
    if url and (CrossrefSource.DOI_URL_RE.match(url) is None):
        s.add_field('url', url)

    # All other fields
    for field_from, field_to in FIELD_MAPPING.items():
        if isinstance(field_to, dict):
            # Per-entrytype mapping with an optional default
            field_to = field_to.get(entry.data['entrytype'],
                                    field_to.get('default'))
        if not field_to:
            continue
        if field_from in data:
            s.add_field(field_to, flexistrip(data[field_from]))

    self._ui.finish_subtask('CrossrefQuery')
    return s

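
# `flexistrip` is applied above to values that Crossref may return either as
# a plain string or as a list of strings (e.g. 'container-title'). A minimal
# sketch of the assumed behaviour, for reference:
def flexistrip(value):
    """Strip a string, or the first element of a non-empty list; None-safe."""
    if isinstance(value, list):
        value = value[0] if value else None
    return value.strip() if isinstance(value, str) else None
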
def _query_blocking(self, entry):
    doi = entry.get_probable_doi()
    if not doi:
        self._ui.finish_subtask('DataCiteQuery')
        return None

    # Okay, we're actually going to make a HTTP request
    self._ratelimit.get()

    url = "https://api.datacite.org/dois/{}".format(urllib.parse.quote(doi))
    response = requests.get(url)
    if response.status_code != 200:
        self._ui.finish_subtask('DataCiteQuery')
        return None

    try:
        data = response.json()
    except ValueError:
        LOGGER.warning("Response did not contain JSON")
        self._ui.finish_subtask('DataCiteQuery')
        return None

    if 'errors' in data:
        self._ui.finish_subtask('DataCiteQuery')
        return None

    attrs = data['data']['attributes']
    s = Suggestion('datacite', entry)

    # Authors
    for adata in attrs['creators']:
        if 'givenName' in adata and 'familyName' in adata:
            s.add_author(adata['givenName'], adata['familyName'])

    # Editors
    for adata in attrs['contributors']:
        if adata.get('contributorType') == 'Editor':
            if 'givenName' in adata and 'familyName' in adata:
                s.add_editor(adata['givenName'], adata['familyName'])

    # Title…s?
    # TODO what happens if there are multiple titles?
    if path_exists(attrs, ('titles', 0, 'title')):
        s.add_field('title', attrs['titles'][0]['title'])

    if 'publisher' in attrs:
        s.add_field('publisher', attrs['publisher'])
    if 'publicationYear' in attrs:
        s.add_field('year', attrs['publicationYear'])
    if 'url' in attrs:
        s.add_field('url', attrs['url'])

    # Container (journal / book series) information. Guard against a
    # missing 'container' key instead of indexing unconditionally.
    cdata = attrs.get('container', {})
    ctype = cdata.get('type')

    if ctype == 'Journal':
        if 'title' in cdata:
            s.add_field('journal', cdata['title'])
    elif ctype == 'Book Series':
        if 'title' in cdata:
            s.add_field('booktitle', cdata['title'])

    if ctype in ('Journal', 'Book Series'):
        if 'volume' in cdata:
            s.add_field('volume', cdata['volume'])
        if 'issue' in cdata:
            s.add_field('issue', cdata['issue'])
        if cdata.get('identifierType') == 'ISSN':
            s.add_field('issn', cdata['identifier'])
        if 'firstPage' in cdata and 'lastPage' in cdata:
            s.add_field('pages', '{}--{}'.format(cdata['firstPage'],
                                                 cdata['lastPage']))

    if path_exists(attrs, ('type', 'bibtex')):
        s.add_field('ENTRYTYPE', attrs['type']['bibtex'])

    self._ui.finish_subtask('DataCiteQuery')
    return s

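
# `path_exists` is used above to probe nested dict/list structures before
# indexing into them. A minimal sketch of the assumed semantics:
def path_exists(data, path):
    """Return True iff data[p0][p1]... resolves for every step in `path`."""
    for step in path:
        try:
            data = data[step]
        except (KeyError, IndexError, TypeError):
            return False
    return True

# path_exists({'titles': [{'title': 'X'}]}, ('titles', 0, 'title'))  # True
# path_exists({}, ('container', 'type'))                             # False
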