Пример #1
0
def get_bibtex(noorlib_url):
    """Return the BibTeX citation for the given noorlib URL as a string."""
    # Scrape the article id from the page, then request its BibTeX export.
    page_html = request(noorlib_url).text
    article_id = BIBTEX_ARTICLE_ID_SEARCH(page_html)[0]
    citation_url = (
        'http://www.noorlib.ir/View/HttpHandler/CitationHandler.ashx?id='
        f'{article_id}&format=BibTex')
    return request(citation_url).text
Пример #2
0
def get_bibtex(noorlib_url):
    """Fetch and return the BibTeX record for a noorlib URL as a string."""
    # The citation handler is keyed by the article id embedded in the page.
    html = request(noorlib_url).text
    found_id = BIBTEX_ARTICLE_ID_SEARCH(html)[0]
    return request(
        'http://www.noorlib.ir/View/HttpHandler/CitationHandler.ashx?id='
        + found_id + '&format=BibTex').text
Пример #3
0
def get_ris(noorlib_url):
    """Return the RIS citation for the given noorlib URL as a string."""
    # Note: copied from the noormags module (currently not supported but
    # may be)[1]
    html = request(noorlib_url).text
    found_id = RIS_ARTICLE_ID_SEARCH(html)[0]
    return request(
        'http://www.noormags.ir/view/CitationHandler.ashx?format=RIS&id='
        + found_id).text
Пример #4
0
def get_ris(noorlib_url):
    """Fetch and return the RIS record for a noorlib URL as a string."""
    # This is copied from the noormags module (currently not supported
    # but may be)[1]
    article_id = RIS_ARTICLE_ID_SEARCH(request(noorlib_url).text)[0]
    ris_url = (
        'http://www.noormags.ir/view/CitationHandler.ashx?format=RIS&id='
        f'{article_id}')
    return request(ris_url).text
Пример #5
0
def get_citoid_dict(isbn) -> Optional[dict]:
    """Return the first citation dict from the Citoid API, or None.

    Returns None when the API does not answer with HTTP 200.
    """
    # https://www.mediawiki.org/wiki/Citoid/API
    response = request(
        'https://en.wikipedia.org/api/rest_v1/data/citation/mediawiki/'
        + isbn)
    if response.status_code == 200:
        return response.json()[0]
    return None
Пример #6
0
def get_citoid_dict(isbn) -> Optional[dict]:
    """Query the Citoid API for `isbn`; return its first result or None."""
    # https://www.mediawiki.org/wiki/Citoid/API
    r = request(
        f'https://en.wikipedia.org/api/rest_v1/data/citation/mediawiki/{isbn}')
    if r.status_code != 200:
        return None
    return r.json()[0]
Пример #7
0
def ottobib(isbn):
    """Convert ISBN to bibtex using ottobib.com.

    Returns None when the expected pattern is not found in the response.
    """
    body = request(
        'http://www.ottobib.com/isbn/' + isbn + '/bibtex').content.decode()
    match = OTTOBIB_SEARCH(body)
    if match is not None:
        return match[1]
Пример #8
0
def googlebooks_scr(parsed_url, date_format='%Y-%m-%d') -> tuple:
    """Create the response namedtuple."""
    query = parse_qs(parsed_url.query)

    # The volume id comes from the `id` query parameter (old URL format)
    # or, failing that, from the last path segment (new URL format).
    id_values = query.get('id')
    if id_values is None:
        volume_id = parsed_url.path.rpartition('/')[2]
    else:
        volume_id = id_values[0]

    ris_text = request(
        f'https://{parsed_url.netloc}/books/download/?id={volume_id}'
        f'&output=ris',
        spoof=True).content.decode('utf8')
    dictionary = ris_parse(ris_text)
    dictionary['date_format'] = date_format
    # manually adding page number to dictionary:
    pg_values = query.get('pg')
    if pg_values is not None:
        first_pg = pg_values[0]
        dictionary['page'] = first_pg[2:]
        dictionary['url'] += f'&pg={first_pg}'
    # although google does not provide a language field:
    if not dictionary['language']:
        dictionary['language'] = classify(dictionary['title'])[0]
    return dict_to_sfn_cit_ref(dictionary)
Пример #9
0
def get_html(url: str) -> str:
    """Return the html string for the given url."""
    with request(url, stream=True, spoof=True) as response:
        check_response_headers(response)
        body = next(response.iter_content(MAX_RESPONSE_LENGTH))
    # Prefer the charset declared in the content itself; otherwise fall
    # back to the encoding reported by the response headers.
    match = CHARSET(body)
    if match:
        return body.decode(match[1].decode())
    return body.decode(response.encoding)
Пример #10
0
def get_html(url: str) -> str:
    """Download `url` (streamed, spoofed agent) and return its html text."""
    with request(url, stream=True, spoof=True) as r:
        check_response_headers(r)
        raw = next(r.iter_content(MAX_RESPONSE_LENGTH))
    m = CHARSET(raw)
    # Use the in-document charset when present, else the header encoding.
    encoding = m[1].decode() if m else r.encoding
    return raw.decode(encoding)
Пример #11
0
def get_ris(googlebook_url):
    """Get ris file content from a Google Books url. Return as string.

    The original docstring claimed the input was "a noormags url", but
    the function clearly parses a books.google.com URL.
    """
    # getting id:
    pu = urlparse(googlebook_url)
    pq = parse_qs(pu.query)
    bookid = pq['id'][0]
    url = 'http://books.google.com/books/download/?id=' +\
        bookid + '&output=ris'
    # Agent spoofing is needed, otherwise: HTTP Error 401: Unauthorized
    return request(url, spoof=True).text
Пример #12
0
def url2dictionary(ketabir_url: str) -> Optional[dict]:
    """Extract citation data from a ketab.ir page.

    Returns a defaultdict (missing keys yield None) with cite_type
    'book', or None when the request fails so the caller can fall back
    to other resolvers (e.g. ottobib in isbn.py).
    """
    try:
        # Try to see if ketabir is available;
        # ottobib should continue its work in isbn.py if it is not.
        r = request(ketabir_url)
    except RequestException:
        logger.exception(ketabir_url)
        return
    html = r.content.decode('utf-8')
    d = defaultdict(lambda: None, cite_type='book')
    d['title'] = TITLE_SEARCH(html)[1]
    # initiating name lists:
    others = []
    authors = []
    editors = []
    translators = []
    # Sort each contributor into a list according to their Persian role
    # label; unrecognized roles are kept with the role appended.
    for role, name in AUTHORS_FINDALL(html):
        if role == 'نويسنده':
            authors.append(first_last(name))
        elif role == 'مترجم':
            translators.append(first_last(name))
        elif role == 'ويراستار':
            editors.append(first_last(name))
        else:
            others.append(('', name + ' (' + role + ')'))
    if authors:
        d['authors'] = authors
    if others:
        d['others'] = others
    if editors:
        d['editors'] = editors
    if translators:
        d['translators'] = translators
    m = PUBLISHER_SEARCH(html)
    if m:
        d['publisher'] = m[1]
    m = DATE_SEARCH(html)
    if m:
        # NOTE(review): the original code branched on `LANG != 'fa'` but
        # both branches were byte-identical, so the dead conditional has
        # been removed. If non-Persian output was meant to differ (e.g.
        # a Gregorian year), that conversion still needs implementing.
        d['month'] = m['month']
        # The site reports two-digit Solar Hijri years; prefix the
        # Persian digits for '13' to form the full year.
        d['year'] = '۱۳' + m['year']
    m = ISBN_SEARCH(html)
    if m:
        d['isbn'] = m[1]
    m = VOLUME_SEARCH(html)
    if m:
        d['volume'] = m[1]
    m = LOCATION_SEARCH(html)
    if m:
        d['publisher-location'] = m[1]
    return d
Пример #13
0
def url2dictionary(ketabir_url: str) -> Optional[dict]:
    """Extract citation data from a ketab.ir page.

    Returns a defaultdict (missing keys yield None) with cite_type
    'book', or None when the request fails so the caller can fall back
    to other resolvers (e.g. ottobib in isbn.py).
    """
    try:
        # Try to see if ketabir is available;
        # ottobib should continue its work in isbn.py if it is not.
        r = request(ketabir_url)
    except RequestException:
        logger.exception(ketabir_url)
        return
    html = r.content.decode('utf-8')
    d = defaultdict(lambda: None, cite_type='book')
    d['title'] = TITLE_SEARCH(html)[1]
    # initiating name lists:
    others = []
    authors = []
    editors = []
    translators = []
    # Sort each contributor into a list according to their Persian role
    # label; unrecognized roles are kept with the role appended.
    for role, name in AUTHORS_FINDALL(html):
        if role == 'نويسنده':
            authors.append(first_last(name))
        elif role == 'مترجم':
            translators.append(first_last(name))
        elif role == 'ويراستار':
            editors.append(first_last(name))
        else:
            others.append(('', f'{name} ({role})'))
    if authors:
        d['authors'] = authors
    if others:
        d['others'] = others
    if editors:
        d['editors'] = editors
    if translators:
        d['translators'] = translators
    m = PUBLISHER_SEARCH(html)
    if m:
        d['publisher'] = m[1]
    m = DATE_SEARCH(html)
    if m:
        # NOTE(review): the original code branched on `LANG != 'fa'` but
        # both branches were byte-identical, so the dead conditional has
        # been removed. If non-Persian output was meant to differ (e.g.
        # a Gregorian year), that conversion still needs implementing.
        d['month'] = m['month']
        # The site reports two-digit Solar Hijri years; prefix the
        # Persian digits for '13' to form the full year.
        d['year'] = '۱۳' + m['year']
    m = ISBN_SEARCH(html)
    if m:
        d['isbn'] = m[1]
    m = VOLUME_SEARCH(html)
    if m:
        d['volume'] = m[1]
    m = LOCATION_SEARCH(html)
    if m:
        d['publisher-location'] = m[1]
    return d
Пример #14
0
def jstor_scr(url: str, date_format: str = '%Y-%m-%d') -> tuple:
    """Build the sfn/cit/ref tuple for a jstor.org URL."""
    # Check open-access status concurrently while fetching the citation.
    access_flags = []
    oa_thread = Thread(target=is_open_access, args=(url, access_flags))
    oa_thread.start()
    jstor_id = urlparse(url).path.rpartition('/')[2]
    bibtex_text = request(
        'https://www.jstor.org/citation/text/' + jstor_id
    ).content.decode('utf8')
    d = bibtex_parse(bibtex_text)
    d['jstor'] = jstor_id
    d['date_format'] = date_format
    oa_thread.join()
    if access_flags:
        d['jstor-access'] = 'free'
    return dict_to_sfn_cit_ref(d)
Пример #15
0
def oclc_scr(oclc: str, date_format: str = '%Y-%m-%d') -> tuple:
    """Build the sfn/cit/ref tuple for the given OCLC number."""
    ris_text = request('https://www.worldcat.org/oclc/' + oclc + '?page=endnote'
                       '&client=worldcat.org-detailed_record').content.decode()
    if '<html' in ris_text:  # invalid OCLC number
        return ('Error processing OCLC number: ' + oclc,
                'Perhaps you entered an invalid OCLC number?', '')
    d = ris_parse(ris_text)
    names = d['authors']
    if names:
        # WorldCat appends a '.' to name parts; strip it except on
        # all-uppercase parts (initials).
        cleaned = []
        for first, last in names:
            if not first.isupper():
                first = first.rstrip('.')
            if not last.isupper():
                last = last.rstrip('.')
            cleaned.append((first, last))
        d['authors'] = cleaned
    d['date_format'] = date_format
    d['oclc'] = oclc
    d['title'] = d['title'].rstrip('.')
    return dict_to_sfn_cit_ref(d)
Пример #16
0
def get_crossref_dict(doi) -> defaultdict:
    """Return the parsed data of crossref.org for the given DOI."""
    # See https://github.com/CrossRef/rest-api-doc/blob/master/api_format.md
    # for documentation.
    # Force using the version 1 of the API to prevent breakage. See:
    # https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md#how-to-manage-api-versions
    j = request('http://api.crossref.org/v1/works/' + doi).json()
    assert j['status'] == 'ok'
    message = j['message']
    d = defaultdict(
        lambda: None, {key.lower(): val for key, val in message.items()})

    d['cite_type'] = d.pop('type')

    # These fields arrive as one-element lists; unwrap each to its first item.
    for field in ('title', 'container-title', 'issn', 'isbn'):
        if d[field]:
            d[field] = d[field][0]

    date_parts = d['issued']['date-parts'][0]
    if len(date_parts) == 3:
        d['date'] = datetime_date(*date_parts)
    elif len(date_parts) == 2:
        d['year'] = str(date_parts[0])
        d['month'] = str(date_parts[1])
    elif date_parts[0]:
        # date can be of the form [None]
        # https://github.com/CrossRef/rest-api-doc/issues/169
        d['year'] = str(date_parts[0])

    extract_names(d, 'author', 'authors')
    extract_names(d, 'editor', 'editors')
    extract_names(d, 'translator', 'translators')

    # Use an en dash in page ranges.
    if d['page']:
        d['page'] = d['page'].replace('-', '–')

    return d
Пример #17
0
def oclc_sfn_cit_ref(oclc: str, date_format: str = '%Y-%m-%d') -> tuple:
    """Return the sfn/cit/ref tuple for the given OCLC number."""
    endnote_text = request(
        'https://www.worldcat.org/oclc/' + oclc + '?page=endnote'
        '&client=worldcat.org-detailed_record').text
    # WorldCat answers with an HTML page instead of RIS for unknown numbers.
    if '<html' in endnote_text:  # invalid OCLC number
        return (
            'Error processing OCLC number: ' + oclc,
            'Perhaps you entered an invalid OCLC number?',
            '')
    d = ris_parse(endnote_text)
    if d['authors']:
        # Strip WorldCat's trailing '.' from name parts, keeping
        # all-uppercase parts (initials) intact.
        d['authors'] = [
            (first if first.isupper() else first.rstrip('.'),
             last if last.isupper() else last.rstrip('.'))
            for first, last in d['authors']]
    d['date_format'] = date_format
    d['oclc'] = oclc
    d['title'] = d['title'].rstrip('.')
    return dict_to_sfn_cit_ref(d)
Пример #18
0
def get_home_title(url: str, home_title_list: List[str]) -> None:
    """Fetch the homepage of `url` and append its title to home_title_list.

    The list parameter carries the result because this function runs in
    a thread. Appends None when no title tag is found; appends nothing
    if the response headers fail validation.
    """
    # Todo: cache the result.
    home_url = '://'.join(urlparse(url)[:2])
    with request(home_url, spoof=True, stream=True) as r:
        try:
            check_response_headers(r)
        except (RequestException, StatusCodeError,
                ContentTypeError, ContentLengthError):
            return
        content = next(r.iter_content(MAX_RESPONSE_LENGTH))
    # Decode using the in-document charset if declared, else the
    # header-reported encoding.
    charset_match = CHARSET(content)
    encoding = charset_match[1].decode() if charset_match else r.encoding
    html = content.decode(encoding)
    title_match = TITLE_TAG(html)
    home_title_list.append(
        html_unescape(title_match['result']) if title_match else None)
Пример #19
0
def get_home_title(url: str, home_title_list: List[str]) -> None:
    """Append the homepage title of `url` to home_title_list.

    Designed to run in a thread; the mutable list is the return channel.
    """
    # Todo: cache the result.
    scheme_netloc = urlparse(url)[:2]
    home_url = '://'.join(scheme_netloc)
    with request(home_url, spoof=True, stream=True) as r:
        try:
            check_response_headers(r)
        except (
            RequestException, StatusCodeError,
            ContentTypeError, ContentLengthError,
        ):
            return
        content = next(r.iter_content(MAX_RESPONSE_LENGTH))
    m = CHARSET(content)
    if m:
        html = content.decode(m[1].decode())
    else:
        html = content.decode(r.encoding)
    m = TITLE_TAG(html)
    if m:
        home_title_list.append(html_unescape(m['result']))
    else:
        home_title_list.append(None)
Пример #20
0
def ncbi(type_: str, id_: str) -> defaultdict:
    """Return the NCBI data for the given id_.

    type_ selects the endpoint: 'pmid' queries PUBMED_URL, any other
    value queries PMC_URL (callers presumably pass 'pmcid'). The result
    is a defaultdict whose missing keys yield None. When a DOI is found
    among the article ids, crossref data is fetched in a background
    thread and merged into the result before returning.

    Raises NCBIError if the JSON response contains an 'error' key.
    """
    # According to https://www.ncbi.nlm.nih.gov/pmc/tools/get-metadata/
    if type_ == 'pmid':
        json_response = request(PUBMED_URL + id_).json()
    else:  # type_ == 'pmcid'
        json_response = request(PMC_URL + id_).json()
    if 'error' in json_response:
        # Example error message if rates are exceeded:
        # {"error":"API rate limit exceeded","count":"11"}
        # https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Coming_in_May_2018_API_Keys
        # Return a 503 Service Unavailable
        raise NCBIError(json_response)
    result_get = json_response['result'][id_].get
    d = defaultdict(lambda: None)

    doi = None
    articleids = result_get('articleids', ())
    for articleid in articleids:
        idtype = articleid['idtype']
        if idtype == 'doi':
            doi = articleid['value']
            # Start the crossref fetch concurrently; joined at the end,
            # only when a DOI was actually found.
            crossref_dict = {}
            crossref_thread = Thread(target=crossref_update,
                                     args=(crossref_dict, doi))
            crossref_thread.start()
            d['doi'] = doi
        elif idtype == 'pmcid':
            # Use NON_DIGITS_SUB to remove the PMC prefix e.g. in PMC3539452
            d['pmcid'] = NON_DIGITS_SUB('', articleid['value'])
        elif idtype == 'pubmed':
            d['pmid'] = articleid['value']
        else:
            d[idtype] = articleid['value']

    d['issn'] = result_get('issn') or result_get('essn')  # essn is eissn

    # Default the publication type to 'journal' when NCBI provides none.
    d['cite_type'] = result_get('pubtype', ('journal', ))[0]

    d['booktitle'] = result_get('booktitle') or result_get('bookname')
    d['edition'] = result_get('edition')
    d['publisher-location'] = result_get('publisherlocation')
    d['publisher'] = result_get('publishername')
    d['url'] = result_get('availablefromurl')
    d['chapter'] = result_get('chapter')

    # Dates arrive space-separated: 'YYYY', 'YYYY Mon', or 'YYYY Mon DD'.
    date = result_get('pubdate') or result_get('epubdate') \
        or result_get('printpubdate')
    date_split = date.split(' ')
    date_len = len(date_split)
    if date_len == 3:
        d['date'] = datetime.strptime(date, '%Y %b %d')
    elif date_len == 2:
        d['year'], d['month'] = \
            date_split[0], str(b_TO_NUM[date_split[1].lower()])
    else:
        d['year'] = date

    authors = []
    authors_append = authors.append
    for author in result_get('authors', ()):
        if author['authtype'] != 'Author':
            continue
        # Treat the first all-uppercase token (initials) as the start of
        # the first name; if none exists, assume the final token is the
        # first name and everything before it the last name.
        parts = author['name'].split()
        for i, p in enumerate(parts):
            if p.isupper():
                last = ' '.join(parts[:i])
                first = ' '.join(parts[i:])
                break
        else:
            last = ' '.join(parts[:-1])
            first = parts[-1]
        authors_append((first, last))
    d['authors'] = authors

    d['journal'] = result_get('fulljournalname') or result_get('source')

    for field in ('title', 'volume', 'issue'):
        d[field] = result_get(field)

    # Use an en dash in page ranges.
    d['page'] = result_get('pages', '').replace('-', '–')

    lang = result_get('lang')
    if lang:
        d['language'] = lang[0]

    if doi:
        # crossref_thread/crossref_dict are only bound when a DOI was
        # found in the loop above, hence the inspection suppressions.
        # noinspection PyUnboundLocalVariable
        crossref_thread.join()
        # noinspection PyUnboundLocalVariable
        d.update(crossref_dict)

    return d
Пример #21
0
def get_bibtex(noormags_url):
    """Return the BibTeX citation for the given noormags URL as a string."""
    # The article id scraped from the page keys the citation-export URL.
    html = request(noormags_url).text
    article_id = BIBTEX_ARTICLE_ID_SEARCH(html)[0]
    return request(
        'http://www.noormags.ir/view/fa/citation/bibtex/' + article_id).text
Пример #22
0
def get_ris(noormags_url):
    """Return the RIS citation for the given noormags URL as a string."""
    html = request(noormags_url).text
    found_id = RIS_ARTICLE_ID_SEARCH(html)[0]
    ris_url = 'http://www.noormags.ir/view/fa/citation/ris/' + found_id
    return request(ris_url).text
Пример #23
0
def is_open_access(url: str, result: list):
    """Append True to `result` if the page at `url` reports open access.

    The list parameter is the return channel for threaded callers.
    """
    page_text = request(url, spoof=True).text
    if '"openAccess" : "True"' in page_text:
        result.append(True)
Пример #24
0
def ncbi(type_: str, id_: str) -> defaultdict:
    """Return the NCBI data for the given id_.

    type_ selects the endpoint: 'pmid' queries PUBMED_URL, any other
    value queries PMC_URL (callers presumably pass 'pmcid'). The result
    is a defaultdict whose missing keys yield None. When a DOI is found
    among the article ids, crossref data is fetched in a background
    thread and merged into the result before returning.

    Raises NCBIError if the JSON response contains an 'error' key.
    """
    # According to https://www.ncbi.nlm.nih.gov/pmc/tools/get-metadata/
    if type_ == 'pmid':
        json_response = request(PUBMED_URL + id_).json()
    else:  # type_ == 'pmcid'
        json_response = request(PMC_URL + id_).json()
    if 'error' in json_response:
        # Example error message if rates are exceeded:
        # {"error":"API rate limit exceeded","count":"11"}
        # https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Coming_in_May_2018_API_Keys
        # Return a 503 Service Unavailable
        raise NCBIError(json_response)
    result_get = json_response['result'][id_].get
    d = defaultdict(lambda: None)

    doi = None
    articleids = result_get('articleids', ())
    for articleid in articleids:
        idtype = articleid['idtype']
        if idtype == 'doi':
            doi = articleid['value']
            # Start the crossref fetch concurrently; joined at the end,
            # only when a DOI was actually found.
            crossref_dict = {}
            crossref_thread = Thread(
                target=crossref_update, args=(crossref_dict, doi))
            crossref_thread.start()
            d['doi'] = doi
        elif idtype == 'pmcid':
            # Use NON_DIGITS_SUB to remove the PMC prefix e.g. in PMC3539452
            d['pmcid'] = NON_DIGITS_SUB('', articleid['value'])
        elif idtype == 'pubmed':
            d['pmid'] = articleid['value']
        else:
            d[idtype] = articleid['value']

    d['issn'] = result_get('issn') or result_get('essn')  # essn is eissn

    # Default the publication type to 'journal' when NCBI provides none.
    d['cite_type'] = result_get('pubtype', ('journal',))[0]

    d['booktitle'] = result_get('booktitle') or result_get('bookname')
    d['edition'] = result_get('edition')
    d['publisher-location'] = result_get('publisherlocation')
    d['publisher'] = result_get('publishername')
    d['url'] = result_get('availablefromurl')
    d['chapter'] = result_get('chapter')

    # Dates arrive space-separated: 'YYYY', 'YYYY Mon', or 'YYYY Mon DD'.
    date = result_get('pubdate') or result_get('epubdate') \
        or result_get('printpubdate')
    date_split = date.split(' ')
    date_len = len(date_split)
    if date_len == 3:
        d['date'] = datetime.strptime(date, '%Y %b %d')
    elif date_len == 2:
        d['year'], d['month'] = \
            date_split[0], str(b_TO_NUM[date_split[1].lower()])
    else:
        d['year'] = date

    authors = []
    authors_append = authors.append
    for author in result_get('authors', ()):
        if author['authtype'] != 'Author':
            continue
        # Treat the first all-uppercase token (initials) as the start of
        # the first name; if none exists, assume the final token is the
        # first name and everything before it the last name.
        parts = author['name'].split()
        for i, p in enumerate(parts):
            if p.isupper():
                last = ' '.join(parts[:i])
                first = ' '.join(parts[i:])
                break
        else:
            last = ' '.join(parts[:-1])
            first = parts[-1]
        authors_append((first, last))
    d['authors'] = authors

    d['journal'] = result_get('fulljournalname') or result_get('source')

    for field in ('title', 'volume', 'issue'):
        d[field] = result_get(field)

    # Use an en dash in page ranges.
    d['page'] = result_get('pages', '').replace('-', '–')

    lang = result_get('lang')
    if lang:
        d['language'] = lang[0]

    if doi:
        # crossref_thread/crossref_dict are only bound when a DOI was
        # found in the loop above, hence the inspection suppressions.
        # noinspection PyUnboundLocalVariable
        crossref_thread.join()
        # noinspection PyUnboundLocalVariable
        d.update(crossref_dict)

    return d
Пример #25
0
def ottobib(isbn):
    """Convert ISBN to bibtex using ottobib.com.

    Returns None when no BibTeX entry is found in the response.
    """
    response_text = request(
        'http://www.ottobib.com/isbn/' + isbn + '/bibtex').text
    match = OTTOBIB_SEARCH(response_text)
    if match:
        return match[1]
Пример #26
0
def ottobib(isbn):
    """Look up `isbn` on ottobib.com and return its BibTeX entry or None."""
    m = OTTOBIB_SEARCH(request(f'http://www.ottobib.com/isbn/{isbn}/bibtex').text)
    return m[1] if m else None
Пример #27
0
def get_bibtex(noormags_url):
    """Fetch and return the BibTeX record for a noormags URL as a string."""
    found_id = BIBTEX_ARTICLE_ID_SEARCH(request(noormags_url).text)[0]
    return request(
        f'http://www.noormags.ir/view/fa/citation/bibtex/{found_id}').text
Пример #28
0
def get_ris(noormags_url):
    """Fetch and return the RIS record for a noormags URL as a string."""
    # The article id from the page keys the RIS citation-export URL.
    page_html = request(noormags_url).text
    article_id = RIS_ARTICLE_ID_SEARCH(page_html)[0]
    ris_url = f'http://www.noormags.ir/view/fa/citation/ris/{article_id}'
    return request(ris_url).text