def get_bibtex(noorlib_url):
    """Get BibTeX file content for the given noorlib url. Return as string."""
    page_text = request(noorlib_url).text
    article_id = BIBTEX_ARTICLE_ID_SEARCH(page_text)[0]
    citation_url = (
        'http://www.noorlib.ir/View/HttpHandler/CitationHandler.ashx?id='
        + article_id + '&format=BibTex'
    )
    return request(citation_url).text
def get_bibtex(noorlib_url):
    """Fetch the noorlib page, extract its article id, and return the BibTeX."""
    article_id = BIBTEX_ARTICLE_ID_SEARCH(request(noorlib_url).text)[0]
    return request(
        f'http://www.noorlib.ir/View/HttpHandler/CitationHandler.ashx'
        f'?id={article_id}&format=BibTex'
    ).text
def get_ris(noorlib_url):
    # Copied from the noormags module (currently not supported, but may be).
    """Get RIS file content for the given noorlib url. Return as string."""
    page_text = request(noorlib_url).text
    article_id = RIS_ARTICLE_ID_SEARCH(page_text)[0]
    return request(
        'http://www.noormags.ir/view/CitationHandler.ashx?format=RIS&id='
        + article_id
    ).text
def get_ris(noorlib_url):
    # Adapted from the noormags module (not currently supported, but may be).
    """Return the RIS citation for the given noorlib url as a string."""
    article_id = RIS_ARTICLE_ID_SEARCH(request(noorlib_url).text)[0]
    url = (
        'http://www.noormags.ir/view/CitationHandler.ashx?format=RIS&id='
        + article_id
    )
    return request(url).text
def get_citoid_dict(isbn) -> Optional[dict]:
    """Return the first citation dict from the Citoid API, or None on failure."""
    # https://www.mediawiki.org/wiki/Citoid/API
    response = request(
        'https://en.wikipedia.org/api/rest_v1/data/citation/mediawiki/' + isbn)
    if response.status_code != 200:
        return None
    return response.json()[0]
def get_citoid_dict(isbn) -> Optional[dict]:
    """Query the Citoid API for isbn and return the first result, or None."""
    # https://www.mediawiki.org/wiki/Citoid/API
    r = request(
        f'https://en.wikipedia.org/api/rest_v1/data/citation/mediawiki/{isbn}')
    if r.status_code == 200:
        return r.json()[0]
    # implicit None for any non-200 response
def ottobib(isbn):
    """Convert ISBN to bibtex using ottobib.com."""
    html = request(
        'http://www.ottobib.com/isbn/' + isbn + '/bibtex').content.decode()
    match = OTTOBIB_SEARCH(html)
    if match is not None:
        return match[1]
def googlebooks_scr(parsed_url, date_format='%Y-%m-%d') -> tuple:
    """Create the response namedtuple."""
    query = parse_qs(parsed_url.query)
    id_values = query.get('id')
    if id_values is None:
        # the new URL format: the volume id is the last path segment
        volume_id = parsed_url.path.rpartition('/')[2]
    else:
        volume_id = id_values[0]
    ris_text = request(
        f'https://{parsed_url.netloc}/books/download/?id={volume_id}'
        f'&output=ris',
        spoof=True,
    ).content.decode('utf8')
    d = ris_parse(ris_text)
    d['date_format'] = date_format
    # manually adding page number to dictionary:
    pg_values = query.get('pg')
    if pg_values is not None:
        pg = pg_values[0]
        d['page'] = pg[2:]
        d['url'] += f'&pg={pg}'
    # although google does not provide a language field:
    if not d['language']:
        d['language'] = classify(d['title'])[0]
    return dict_to_sfn_cit_ref(d)
def get_html(url: str) -> str:
    """Return the html string for the given url."""
    with request(url, stream=True, spoof=True) as response:
        check_response_headers(response)
        raw = next(response.iter_content(MAX_RESPONSE_LENGTH))
        charset_match = CHARSET(raw)
        encoding = (
            charset_match[1].decode() if charset_match else response.encoding)
        return raw.decode(encoding)
def get_html(url: str) -> str:
    """Return the html string for the given url."""
    response = request(url, stream=True, spoof=True)
    with response:
        check_response_headers(response)
        raw = next(response.iter_content(MAX_RESPONSE_LENGTH))
        m = CHARSET(raw)
        if m:
            return raw.decode(m[1].decode())
        return raw.decode(response.encoding)
def get_ris(googlebook_url):
    """Get RIS file content for the given Google Books url."""
    # extract the volume id from the query string:
    parsed = urlparse(googlebook_url)
    book_id = parse_qs(parsed.query)['id'][0]
    # Agent spoofing is needed, otherwise: HTTP Error 401: Unauthorized
    return request(
        'http://books.google.com/books/download/?id='
        + book_id + '&output=ris',
        spoof=True,
    ).text
def url2dictionary(ketabir_url: str) -> Optional[dict]:
    """Extract citation data from the given ketab.ir page.

    Return a defaultdict of citation fields (missing keys default to
    None), or None if the page cannot be fetched.
    """
    try:
        # Try to see if ketabir is available;
        # ottobib should continue its work in isbn.py if it is not.
        r = request(ketabir_url)
    except RequestException:
        logger.exception(ketabir_url)
        return
    html = r.content.decode('utf-8')
    d = defaultdict(lambda: None, cite_type='book')
    d['title'] = TITLE_SEARCH(html)[1]
    # initiating name lists:
    others = []
    authors = []
    editors = []
    translators = []
    # building lists (role strings are Persian: author/translator/editor):
    for role, name in AUTHORS_FINDALL(html):
        if role == 'نويسنده':
            authors.append(first_last(name))
        elif role == 'مترجم':
            translators.append(first_last(name))
        elif role == 'ويراستار':
            editors.append(first_last(name))
        else:
            others.append(('', name + ' (' + role + ')'))
    if authors:
        d['authors'] = authors
    if others:
        d['others'] = others
    if editors:
        d['editors'] = editors
    if translators:
        d['translators'] = translators
    m = PUBLISHER_SEARCH(html)
    if m:
        d['publisher'] = m[1]
    m = DATE_SEARCH(html)
    if m:
        # Both branches of the original `LANG != 'fa'` check were
        # identical, so the conditional was redundant and is removed.
        d['month'] = m['month']
        # Year is a two-digit Solar Hijri value; prepend the '۱۳' century.
        d['year'] = '۱۳' + m['year']
    m = ISBN_SEARCH(html)
    if m:
        d['isbn'] = m[1]
    m = VOLUME_SEARCH(html)
    if m:
        d['volume'] = m[1]
    m = LOCATION_SEARCH(html)
    if m:
        d['publisher-location'] = m[1]
    return d
def url2dictionary(ketabir_url: str) -> Optional[dict]:
    """Extract citation data from the given ketab.ir page.

    Return a defaultdict of citation fields (missing keys default to
    None), or None if the page cannot be fetched.
    """
    try:
        # Try to see if ketabir is available;
        # ottobib should continue its work in isbn.py if it is not.
        r = request(ketabir_url)
    except RequestException:
        logger.exception(ketabir_url)
        return
    html = r.content.decode('utf-8')
    d = defaultdict(lambda: None, cite_type='book')
    d['title'] = TITLE_SEARCH(html)[1]
    # initiating name lists:
    others = []
    authors = []
    editors = []
    translators = []
    # building lists (role strings are Persian: author/translator/editor):
    for role, name in AUTHORS_FINDALL(html):
        if role == 'نويسنده':
            authors.append(first_last(name))
        elif role == 'مترجم':
            translators.append(first_last(name))
        elif role == 'ويراستار':
            editors.append(first_last(name))
        else:
            others.append(('', f'{name} ({role})'))
    if authors:
        d['authors'] = authors
    if others:
        d['others'] = others
    if editors:
        d['editors'] = editors
    if translators:
        d['translators'] = translators
    m = PUBLISHER_SEARCH(html)
    if m:
        d['publisher'] = m[1]
    m = DATE_SEARCH(html)
    if m:
        # Both branches of the original `LANG != 'fa'` check were
        # identical, so the conditional was redundant and is removed.
        d['month'] = m['month']
        # Year is a two-digit Solar Hijri value; prepend the '۱۳' century.
        d['year'] = '۱۳' + m['year']
    m = ISBN_SEARCH(html)
    if m:
        d['isbn'] = m[1]
    m = VOLUME_SEARCH(html)
    if m:
        d['volume'] = m[1]
    m = LOCATION_SEARCH(html)
    if m:
        d['publisher-location'] = m[1]
    return d
def jstor_scr(url: str, date_format: str = '%Y-%m-%d') -> tuple:
    """Create the citation tuple for the given JSTOR url."""
    # Check open-access status concurrently while fetching the citation.
    open_access = []
    oa_thread = Thread(target=is_open_access, args=(url, open_access))
    oa_thread.start()
    id_ = urlparse(url).path.rpartition('/')[2]
    bibtex = request(
        'https://www.jstor.org/citation/text/' + id_).content.decode('utf8')
    d = bibtex_parse(bibtex)
    d['jstor'] = id_
    d['date_format'] = date_format
    oa_thread.join()
    if open_access:
        d['jstor-access'] = 'free'
    return dict_to_sfn_cit_ref(d)
def oclc_scr(oclc: str, date_format: str = '%Y-%m-%d') -> tuple:
    """Create the citation tuple for the given OCLC number."""
    text = request(
        'https://www.worldcat.org/oclc/' + oclc + '?page=endnote'
        '&client=worldcat.org-detailed_record').content.decode()
    if '<html' in text:  # invalid OCLC number
        return (
            'Error processing OCLC number: ' + oclc,
            'Perhaps you entered an invalid OCLC number?',
            '')
    d = ris_parse(text)
    authors = d['authors']
    if authors:
        # worldcat has a '.' the end of the first name
        d['authors'] = [
            (
                first if first.isupper() else first.rstrip('.'),
                last if last.isupper() else last.rstrip('.'),
            )
            for first, last in authors
        ]
    d['date_format'] = date_format
    d['oclc'] = oclc
    d['title'] = d['title'].rstrip('.')
    return dict_to_sfn_cit_ref(d)
def get_crossref_dict(doi) -> defaultdict:
    """Return the parsed data of crossref.org for the given DOI."""
    # See https://github.com/CrossRef/rest-api-doc/blob/master/api_format.md
    # for documentation.
    # Force using the version 1 of the API to prevent breakage. See:
    # https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md#how-to-manage-api-versions
    json_response = request('http://api.crossref.org/v1/works/' + doi).json()
    assert json_response['status'] == 'ok'
    d = defaultdict(
        lambda: None,
        {key.lower(): value
         for key, value in json_response['message'].items()})
    d['cite_type'] = d.pop('type')
    # These fields arrive as lists; keep only the first entry.
    for field in ('title', 'container-title', 'issn', 'isbn'):
        if d[field]:
            d[field] = d[field][0]
    date_parts = d['issued']['date-parts'][0]
    if len(date_parts) == 3:
        d['date'] = datetime_date(*date_parts)
    elif len(date_parts) == 2:
        d['year'], d['month'] = str(date_parts[0]), str(date_parts[1])
    elif date_parts[0]:
        # date can be of the form [None]
        # https://github.com/CrossRef/rest-api-doc/issues/169
        d['year'] = str(date_parts[0])
    extract_names(d, 'author', 'authors')
    extract_names(d, 'editor', 'editors')
    extract_names(d, 'translator', 'translators')
    if d['page']:
        # use an en dash for page ranges
        d['page'] = d['page'].replace('-', '–')
    return d
def oclc_sfn_cit_ref(oclc: str, date_format: str = '%Y-%m-%d') -> tuple:
    """Create the citation tuple for the given OCLC number."""
    text = request(
        'https://www.worldcat.org/oclc/' + oclc + '?page=endnote'
        '&client=worldcat.org-detailed_record').text
    if '<html' in text:  # invalid OCLC number
        return (
            'Error processing OCLC number: ' + oclc,
            'Perhaps you entered an invalid OCLC number?',
            '')
    d = ris_parse(text)
    if d['authors']:
        # worldcat has a '.' the end of the first name
        cleaned = []
        for first, last in d['authors']:
            if not first.isupper():
                first = first.rstrip('.')
            if not last.isupper():
                last = last.rstrip('.')
            cleaned.append((first, last))
        d['authors'] = cleaned
    d['date_format'] = date_format
    d['oclc'] = oclc
    d['title'] = d['title'].rstrip('.')
    return dict_to_sfn_cit_ref(d)
def get_home_title(url: str, home_title_list: List[str]) -> None:
    """Fetch the homepage of the given url and append its title.

    home_title_list will be used to return the thread result.
    This function is invoked through a thread.
    """
    # Todo: cache the result.
    home_url = '://'.join(urlparse(url)[:2])
    with request(home_url, spoof=True, stream=True) as r:
        try:
            check_response_headers(r)
        except (
            RequestException,
            StatusCodeError,
            ContentTypeError,
            ContentLengthError,
        ):
            return
        content = next(r.iter_content(MAX_RESPONSE_LENGTH))
        charset_match = CHARSET(content)
        html = content.decode(
            charset_match[1].decode() if charset_match else r.encoding)
        title_match = TITLE_TAG(html)
        home_title_list.append(
            html_unescape(title_match['result']) if title_match else None)
def get_home_title(url: str, home_title_list: List[str]) -> None:
    """Append the title of the url's homepage to home_title_list.

    home_title_list will be used to return the thread result.
    This function is invoked through a thread.
    """
    # Todo: cache the result.
    scheme_and_netloc = urlparse(url)[:2]
    with request(
        '://'.join(scheme_and_netloc), spoof=True, stream=True
    ) as resp:
        try:
            check_response_headers(resp)
        except (
            RequestException,
            StatusCodeError,
            ContentTypeError,
            ContentLengthError,
        ):
            return
        body = next(resp.iter_content(MAX_RESPONSE_LENGTH))
        m = CHARSET(body)
        if m:
            html = body.decode(m[1].decode())
        else:
            html = body.decode(resp.encoding)
        m = TITLE_TAG(html)
        if m:
            home_title_list.append(html_unescape(m['result']))
        else:
            home_title_list.append(None)
def ncbi(type_: str, id_: str) -> defaultdict:
    """Return the NCBI data for the given id_.

    type_ is either 'pmid' (PubMed) or 'pmcid' (PubMed Central); id_ is the
    corresponding identifier. Raises NCBIError when the API returns an error
    (e.g. rate-limit exceeded). Missing keys in the result default to None.
    """
    # According to https://www.ncbi.nlm.nih.gov/pmc/tools/get-metadata/
    if type_ == 'pmid':
        json_response = request(PUBMED_URL + id_).json()
    else:  # type_ == 'pmcid'
        json_response = request(PMC_URL + id_).json()
    if 'error' in json_response:
        # Example error message if rates are exceeded:
        # {"error":"API rate limit exceeded","count":"11"}
        # https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Coming_in_May_2018_API_Keys
        # Return a 503 Service Unavailable
        raise NCBIError(json_response)
    # Bind the .get of the result record once; used for all field lookups.
    result_get = json_response['result'][id_].get
    d = defaultdict(lambda: None)
    doi = None
    articleids = result_get('articleids', ())
    for articleid in articleids:
        idtype = articleid['idtype']
        if idtype == 'doi':
            doi = articleid['value']
            # Fetch crossref data for this DOI in the background; joined
            # at the end of the function before merging into d.
            crossref_dict = {}
            crossref_thread = Thread(
                target=crossref_update, args=(crossref_dict, doi))
            crossref_thread.start()
            d['doi'] = doi
        elif idtype == 'pmcid':
            # Use NON_DIGITS_SUB to remove the PMC prefix e.g.
            # in PMC3539452
            d['pmcid'] = NON_DIGITS_SUB('', articleid['value'])
        elif idtype == 'pubmed':
            d['pmid'] = articleid['value']
        else:
            d[idtype] = articleid['value']
    d['issn'] = result_get('issn') or result_get('essn')  # essn is eissn
    d['cite_type'] = result_get('pubtype', ('journal',))[0]
    d['booktitle'] = result_get('booktitle') or result_get('bookname')
    d['edition'] = result_get('edition')
    d['publisher-location'] = result_get('publisherlocation')
    d['publisher'] = result_get('publishername')
    d['url'] = result_get('availablefromurl')
    d['chapter'] = result_get('chapter')
    # Prefer print publication date, then electronic, then printpub.
    date = result_get('pubdate') or result_get('epubdate') \
        or result_get('printpubdate')
    date_split = date.split(' ')
    date_len = len(date_split)
    if date_len == 3:
        # full date, e.g. "2013 Jan 15"
        d['date'] = datetime.strptime(date, '%Y %b %d')
    elif date_len == 2:
        # year and month name, e.g. "2013 Jan"
        d['year'], d['month'] = \
            date_split[0], str(b_TO_NUM[date_split[1].lower()])
    else:
        # year only
        d['year'] = date
    authors = []
    authors_append = authors.append
    for author in result_get('authors', ()):
        if author['authtype'] != 'Author':
            continue
        # NCBI names look like "Last IN" (initials are the uppercase
        # token); split on the first all-uppercase part when present.
        parts = author['name'].split()
        for i, p in enumerate(parts):
            if p.isupper():
                last = ' '.join(parts[:i])
                first = ' '.join(parts[i:])
                break
        else:
            # no all-uppercase token found; fall back to last word = first
            last = ' '.join(parts[:-1])
            first = parts[-1]
        authors_append((first, last))
    d['authors'] = authors
    d['journal'] = result_get('fulljournalname') or result_get('source')
    for field in ('title', 'volume', 'issue'):
        d[field] = result_get(field)
    # en dash for page ranges
    d['page'] = result_get('pages', '').replace('-', '–')
    lang = result_get('lang')
    if lang:
        d['language'] = lang[0]
    if doi:
        # crossref_thread/crossref_dict are only bound when a DOI was seen,
        # which is exactly the `if doi:` condition.
        # noinspection PyUnboundLocalVariable
        crossref_thread.join()
        # noinspection PyUnboundLocalVariable
        d.update(crossref_dict)
    return d
def get_bibtex(noormags_url):
    """Get BibTex file content from a noormags_url. Return as string."""
    article_id = BIBTEX_ARTICLE_ID_SEARCH(request(noormags_url).text)[0]
    return request(
        'http://www.noormags.ir/view/fa/citation/bibtex/' + article_id).text
def get_ris(noormags_url):
    """Get RIS file content for the given noormags url. Return as string."""
    page_text = request(noormags_url).text
    article_id = RIS_ARTICLE_ID_SEARCH(page_text)[0]
    citation_url = 'http://www.noormags.ir/view/fa/citation/ris/' + article_id
    return request(citation_url).text
def is_open_access(url: str, result: list):
    """Append True to result if the page at url reports open access.

    Designed to run in a thread; result carries the outcome back.
    """
    page_text = request(url, spoof=True).text
    if '"openAccess" : "True"' in page_text:
        result.append(True)
def ncbi(type_: str, id_: str) -> defaultdict:
    """Return the NCBI data for the given id_.

    type_ is either 'pmid' (PubMed) or 'pmcid' (PubMed Central); id_ is the
    corresponding identifier. Raises NCBIError when the API returns an error
    (e.g. rate-limit exceeded). Missing keys in the result default to None.
    """
    # According to https://www.ncbi.nlm.nih.gov/pmc/tools/get-metadata/
    if type_ == 'pmid':
        json_response = request(PUBMED_URL + id_).json()
    else:  # type_ == 'pmcid'
        json_response = request(PMC_URL + id_).json()
    if 'error' in json_response:
        # Example error message if rates are exceeded:
        # {"error":"API rate limit exceeded","count":"11"}
        # https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Coming_in_May_2018_API_Keys
        # Return a 503 Service Unavailable
        raise NCBIError(json_response)
    # Bind the .get of the result record once; used for all field lookups.
    result_get = json_response['result'][id_].get
    d = defaultdict(lambda: None)
    doi = None
    articleids = result_get('articleids', ())
    for articleid in articleids:
        idtype = articleid['idtype']
        if idtype == 'doi':
            doi = articleid['value']
            # Fetch crossref data for this DOI in the background; joined
            # at the end of the function before merging into d.
            crossref_dict = {}
            crossref_thread = Thread(
                target=crossref_update, args=(crossref_dict, doi))
            crossref_thread.start()
            d['doi'] = doi
        elif idtype == 'pmcid':
            # Use NON_DIGITS_SUB to remove the PMC prefix e.g.
            # in PMC3539452
            d['pmcid'] = NON_DIGITS_SUB('', articleid['value'])
        elif idtype == 'pubmed':
            d['pmid'] = articleid['value']
        else:
            d[idtype] = articleid['value']
    d['issn'] = result_get('issn') or result_get('essn')  # essn is eissn
    d['cite_type'] = result_get('pubtype', ('journal',))[0]
    d['booktitle'] = result_get('booktitle') or result_get('bookname')
    d['edition'] = result_get('edition')
    d['publisher-location'] = result_get('publisherlocation')
    d['publisher'] = result_get('publishername')
    d['url'] = result_get('availablefromurl')
    d['chapter'] = result_get('chapter')
    # Prefer print publication date, then electronic, then printpub.
    date = result_get('pubdate') or result_get('epubdate') \
        or result_get('printpubdate')
    date_split = date.split(' ')
    date_len = len(date_split)
    if date_len == 3:
        # full date, e.g. "2013 Jan 15"
        d['date'] = datetime.strptime(date, '%Y %b %d')
    elif date_len == 2:
        # year and month name, e.g. "2013 Jan"
        d['year'], d['month'] = \
            date_split[0], str(b_TO_NUM[date_split[1].lower()])
    else:
        # year only
        d['year'] = date
    authors = []
    authors_append = authors.append
    for author in result_get('authors', ()):
        if author['authtype'] != 'Author':
            continue
        # NCBI names look like "Last IN" (initials are the uppercase
        # token); split on the first all-uppercase part when present.
        parts = author['name'].split()
        for i, p in enumerate(parts):
            if p.isupper():
                last = ' '.join(parts[:i])
                first = ' '.join(parts[i:])
                break
        else:
            # no all-uppercase token found; fall back to last word = first
            last = ' '.join(parts[:-1])
            first = parts[-1]
        authors_append((first, last))
    d['authors'] = authors
    d['journal'] = result_get('fulljournalname') or result_get('source')
    for field in ('title', 'volume', 'issue'):
        d[field] = result_get(field)
    # en dash for page ranges
    d['page'] = result_get('pages', '').replace('-', '–')
    lang = result_get('lang')
    if lang:
        d['language'] = lang[0]
    if doi:
        # crossref_thread/crossref_dict are only bound when a DOI was seen,
        # which is exactly the `if doi:` condition.
        # noinspection PyUnboundLocalVariable
        crossref_thread.join()
        # noinspection PyUnboundLocalVariable
        d.update(crossref_dict)
    return d
def ottobib(isbn):
    """Convert ISBN to bibtex using ottobib.com."""
    response_text = request(
        'http://www.ottobib.com/isbn/' + isbn + '/bibtex').text
    match = OTTOBIB_SEARCH(response_text)
    if match:
        return match[1]
def ottobib(isbn):
    """Convert ISBN to bibtex using ottobib.com."""
    m = OTTOBIB_SEARCH(
        request(f'http://www.ottobib.com/isbn/{isbn}/bibtex').text)
    return m[1] if m else None
def get_bibtex(noormags_url):
    """Return the BibTeX citation for the given noormags url as a string."""
    html = request(noormags_url).text
    article_id = BIBTEX_ARTICLE_ID_SEARCH(html)[0]
    citation_url = (
        'http://www.noormags.ir/view/fa/citation/bibtex/' + article_id)
    return request(citation_url).text
def get_ris(noormags_url):
    """Return the RIS citation for the given noormags url as a string."""
    article_id = RIS_ARTICLE_ID_SEARCH(request(noormags_url).text)[0]
    return request(
        f'http://www.noormags.ir/view/fa/citation/ris/{article_id}').text