Exemplo n.º 1
0
def _get_paper(paper_metadata: dict) -> Paper:
    """
    Build a Paper object from a preprint metadata record

    Parameters
    ----------
    paper_metadata : dict
        Paper metadata. The keys read here are 'title', 'abstract',
        'authors' (a ';'-separated string), 'date' (formatted as
        YYYY-MM-DD), 'doi' and 'published'

    Returns
    -------
    Paper
        Paper object

    Raises
    ------
    TypeError, ValueError
        If 'date' is missing or is not formatted as YYYY-MM-DD
    """

    paper_title = paper_metadata.get('title')
    paper_abstract = paper_metadata.get('abstract')

    # a missing author string yields no authors instead of an AttributeError,
    # and a trailing ';' no longer produces an empty author name
    raw_authors = paper_metadata.get('authors') or ''
    paper_authors = [x.strip() for x in raw_authors.split(';') if x.strip()]

    publication = None
    paper_publication_date = datetime.datetime.strptime(
        paper_metadata.get('date'), '%Y-%m-%d').date()

    # only build the DOI resolver URL when there is a DOI to resolve,
    # otherwise the paper would carry the bogus 'https://doi.org/None'
    preprint_doi = paper_metadata.get('doi')
    paper_urls = {f'https://doi.org/{preprint_doi}'} if preprint_doi else set()
    paper_doi = preprint_doi

    paper_citations = None
    paper_keywords = None
    paper_comments = None
    paper_number_of_pages = None
    paper_pages = None

    # 'published' is either the marker 'na' or a value that replaces the
    # preprint DOI (presumably the DOI of the published version - confirm
    # against the API); backslashes are stripped from it
    published = paper_metadata.get('published')
    if published and published.lower() != 'na':
        paper_doi = published.replace('\\', '')

    return Paper(paper_title, paper_abstract, paper_authors, publication,
                 paper_publication_date, paper_urls, paper_doi,
                 paper_citations, paper_keywords, paper_comments, paper_number_of_pages, paper_pages)
Exemplo n.º 2
0
def _get_paper(paper_entry: dict, paper_publication_date: datetime.date,
               publication: Publication) -> Paper:
    """
    Using a paper entry provided, this method builds a paper instance

    Parameters
    ----------
    paper_entry : dict
        A paper entry retrieved from arXiv API
    paper_publication_date : datetime.date
        The paper publication date
    publication : Publication
        A publication instance that will be associated with the paper

    Returns
    -------
    Paper
        A paper instance
    """

    paper_title = paper_entry.get('title', None)

    if paper_title is None or len(paper_title) == 0:
        return None

    paper_title = paper_title.replace('\n', '')
    paper_title = re.sub(' +', ' ', paper_title)

    paper_doi = paper_entry.get('arxiv:doi').get(
        '#text') if 'arxiv:doi' in paper_entry else None
    paper_abstract = paper_entry.get('summary', None)
    paper_urls = set()
    paper_authors = []

    if 'link' in paper_entry:
        if isinstance(paper_entry.get('link'), list):
            for link in paper_entry.get('link'):
                paper_urls.add(link.get('@href'))
        else:
            paper_urls.add(paper_entry.get('link').get('@href'))

    if 'author' in paper_entry:
        if isinstance(paper_entry.get('author'), list):
            for author in paper_entry.get('author'):
                paper_authors.append(author.get('name'))
        else:
            paper_authors.append(paper_entry.get('author').get('name'))

    paper_comments = paper_entry.get('arxiv:comment', {}).get('#text', None)

    paper = Paper(paper_title,
                  paper_abstract,
                  paper_authors,
                  publication,
                  paper_publication_date,
                  paper_urls,
                  paper_doi,
                  comments=paper_comments)

    return paper
Exemplo n.º 3
0
def paper(publication):
    """
    Build a Paper with every constructor argument filled in with fixed
    sample values, attached to the given publication.

    The `paper: Paper` parameters of the tests suggest this is consumed as
    a pytest fixture - the decorator is not visible here, confirm.
    """

    rooftop_concert_url = "https://en.wikipedia.org/wiki/The_Beatles'_rooftop_concert"

    return Paper(
        'awesome paper title',                              # title
        'a long abstract',                                  # abstract
        ['Dr Paul', 'Dr John', 'Dr George', 'Dr Ringo'],    # authors
        publication,
        datetime.date(1969, 1, 30),                         # publication date
        {rooftop_concert_url},                              # urls
        'fake-doi',                                         # doi
        25,                                                 # citations
        {'term A', 'term B'},                               # keywords
        'some comments',                                    # comments
        4,                                                  # number of pages
        '1-4',                                              # pages
        {'arXiv', 'ACM', 'IEEE', 'PubMed', 'Scopus'},       # databases
        True,                                               # selected
        {'Facet A': ['Category A', 'Category B']},          # categories
    )
Exemplo n.º 4
0
def _get_paper(paper_entry: dict, publication: Publication) -> Paper:
    """
    Using a paper entry provided, this method builds a paper instance

    Parameters
    ----------
    paper_entry : dict
        A paper entry retrieved from IEEE API
    publication : Publication
        A publication instance that will be associated with the paper

    Returns
    -------
    Paper
        A paper instance or None
    """

    article = paper_entry.get('PubmedArticleSet').get(
        'PubmedArticle').get('MedlineCitation').get('Article')

    paper_title = article.get('ArticleTitle', None)

    if paper_title is None or len(paper_title) == 0:
        return None

    paper_title = paper_title if isinstance(paper_title, str) else paper_title.get('#text')

    if 'ArticleDate' in article:
        paper_publication_date_day = article.get('ArticleDate').get('Day')
        paper_publication_date_month = article.get('ArticleDate').get('Month')
        paper_publication_date_year = article.get('ArticleDate').get('Year')
    else:
        paper_publication_date_day = 1
        paper_publication_date_month = common_util.get_numeric_month_by_string(
            article.get('Journal').get('JournalIssue').get('PubDate').get('Month'))
        paper_publication_date_year = article.get('Journal').get(
            'JournalIssue').get('PubDate').get('Year')

    paper_doi = None
    paper_ids = paper_entry.get('PubmedArticleSet').get('PubmedArticle').get(
        'PubmedData').get('ArticleIdList').get('ArticleId')
    for paper_id in paper_ids:
        if paper_id.get('@IdType') == 'doi':
            paper_doi = paper_id.get('#text')
            break

    paper_abstract = None
    paper_abstract_entry = article.get('Abstract', {}).get('AbstractText', None)
    if paper_abstract_entry is None:
        raise ValueError('Paper abstract is empty')

    if isinstance(paper_abstract_entry, list):
        paper_abstract = '\n'.join(
            [x.get('#text') for x in paper_abstract_entry if x.get('#text') is not None])
    else:
        paper_abstract = paper_abstract_entry if isinstance(paper_abstract_entry, str) else paper_abstract_entry.get('#text')

    try:
        paper_keywords = set([x.get('#text').strip() for x in paper_entry.get('PubmedArticleSet').get(
            'PubmedArticle').get('MedlineCitation').get('KeywordList').get('Keyword')])
    except Exception:
        paper_keywords = set()

    try:
        paper_publication_date = datetime.date(int(paper_publication_date_year), int(
            paper_publication_date_month), int(paper_publication_date_day))
    except Exception:
        paper_publication_date = datetime.date(
            int(paper_publication_date_year), 1, 1)

    if paper_publication_date is None:
        return None

    paper_authors = []
    retrived_authors = []
    if isinstance(article.get('AuthorList').get('Author'), dict): # only one author
        retrived_authors = [article.get('AuthorList').get('Author')]
    else:
        retrived_authors = article.get('AuthorList').get('Author')

    for author in retrived_authors:
        if isinstance(author, str):
            paper_authors.append(author)
        elif isinstance(author, dict):
            paper_authors.append(f"{author.get('ForeName')} {author.get('LastName')}")

    paper_pages = None
    paper_number_of_pages = None
    try:
        paper_pages = article.get('Pagination').get('MedlinePgn')
        if not paper_pages.isdigit(): # if it's a digit, the paper pages range is invalid
            pages_split = paper_pages.split('-')
            paper_number_of_pages = abs(int(pages_split[0])-int(pages_split[1]))+1
    except Exception:  # pragma: no cover
        pass

    paper = Paper(paper_title, paper_abstract, paper_authors, publication,
                  paper_publication_date, set(), paper_doi, None, paper_keywords, None, 
                  paper_number_of_pages, paper_pages)

    return paper
Exemplo n.º 5
0
def _get_paper(paper_entry: dict, publication: Publication) -> Paper:
    """
    Using a paper entry provided, this method builds a paper instance

    Parameters
    ----------
    paper_entry : dict
        A paper entry retrieved from IEEE API
    publication : Publication
        A publication instance that will be associated with the paper

    Returns
    -------
    Paper
        A paper instance or None
    """

    paper_title = paper_entry.get('title', None)

    if paper_title is None or len(paper_title) == 0:
        return None

    paper_publication_date = paper_entry.get('publication_date', None)
    paper_doi = paper_entry.get('doi', None)
    paper_citations = paper_entry.get('citing_paper_count', None)
    paper_abstract = paper_entry.get('abstract', None)
    paper_urls = {paper_entry.get('pdf_url')}
    paper_pages = None
    paper_number_of_pages = None

    try:
        paper_keywords = set([ x.strip() for x in paper_entry.get(
            'index_terms').get('author_terms').get('terms')])
    except Exception as e:
        paper_keywords = set()

    if paper_publication_date is not None:
        try:
            paper_publication_date_split = paper_publication_date.split(' ')
            day = int(paper_publication_date_split[0].split('-')[0])
            month = int(common_util.get_numeric_month_by_string(
                paper_publication_date_split[1]))
            year = int(paper_publication_date_split[2])

            paper_publication_date = datetime.date(year, month, day)
        except Exception as e:
            pass

    if not isinstance(paper_publication_date, datetime.date):
        paper_publication_date = datetime.date(
            paper_entry.get('publication_year'), 1, 1)

    if paper_publication_date is None:
        return None

    paper_authors = []
    for author in paper_entry.get('authors').get('authors'):
        paper_authors.append(author.get('full_name'))

    start_page = paper_entry.get('start_page', None)
    end_page = paper_entry.get('end_page', None)
    

    if start_page is not None and end_page is not None:
        try:
            paper_pages = f"{paper_entry.get('start_page')}-{paper_entry.get('end_page')}"
            paper_number_of_pages = abs(
                int(paper_entry.get('start_page'))-int(paper_entry.get('end_page')))+1
        except Exception:  # pragma: no cover
            pass

    paper = Paper(paper_title, paper_abstract, paper_authors, publication,
                  paper_publication_date, paper_urls, paper_doi, paper_citations, 
                  paper_keywords, None, paper_number_of_pages, paper_pages)

    return paper
Exemplo n.º 6
0
def _get_paper(paper_page: html.HtmlElement, paper_doi: str, paper_url: str) -> Paper:
    """
    Using a paper page provided, this method builds a paper instance

    The abstract and the citation count are scraped from the page; every
    other field comes from the metadata returned by _get_paper_metadata
    for the given DOI.

    Parameters
    ----------
    paper_page : html.HtmlElement
        A paper page retrieved from ACM
    paper_doi : str
        The paper DOI
    paper_url : str
        The ACM paper URL

    Returns
    -------
    Paper
        A paper instance, or None when the metadata is unavailable or
        lacks a title or an issue date
    """

    # a page without an abstract section yields no abstract instead of an IndexError
    abstract_elements = paper_page.xpath(
        '//*[contains(@class, "abstractSection")]/p')
    paper_abstract = abstract_elements[0].text if abstract_elements else None

    citation_elements = paper_page.xpath(
        '//*[contains(@class, "article-metric citation")]//span')
    paper_citations = None
    if len(citation_elements) == 1:
        paper_citations = int(citation_elements[0].text)

    paper_metadata = _get_paper_metadata(paper_doi)

    if paper_metadata is None:
        return None

    publication = None
    publication_title = paper_metadata.get('container-title', None)

    if publication_title is not None and len(publication_title) > 0:

        publication_isbn = paper_metadata.get('ISBN', None)
        publication_issn = paper_metadata.get('ISSN', None)
        publication_publisher = paper_metadata.get('publisher', None)
        publication_category = paper_metadata.get('type', None)

        publication = Publication(publication_title, publication_isbn,
                                  publication_issn, publication_publisher, publication_category)

    paper_title = paper_metadata.get('title', None)

    if paper_title is None or len(paper_title) == 0:
        return None

    # use whichever name parts exist rather than rendering "None"
    paper_authors = []
    for author in paper_metadata.get('author', []):
        name_parts = [author.get('given'), author.get('family')]
        author_name = ' '.join(str(x) for x in name_parts if x)
        if author_name:
            paper_authors.append(author_name)

    paper_publication_date = None
    issued = paper_metadata.get('issued', None)
    if issued is not None:
        date_parts = issued['date-parts'][0]
        if date_parts:
            # date-parts holds [year], [year, month] or [year, month, day];
            # missing month/day default to 1
            year, month, day = (list(date_parts) + [1, 1])[:3]
            paper_publication_date = datetime.date(year, month, day)

    if paper_publication_date is None:
        return None

    paper_keywords = set()
    if paper_metadata.get('keyword', None) is not None:
        paper_keywords = {x.strip() for x in paper_metadata['keyword'].split(',')}

    paper_pages = paper_metadata.get('page', None)
    if paper_pages is not None:
        # normalize the en dash used in page ranges to a plain hyphen
        paper_pages = paper_pages.replace('\u2013', '-')

    paper_number_of_pages = paper_metadata.get('number-of-pages', None)
    if paper_number_of_pages is not None:
        paper_number_of_pages = int(paper_number_of_pages)

    if paper_doi is None:
        paper_doi = paper_metadata.get('DOI')

    paper = Paper(paper_title, paper_abstract, paper_authors, publication,
                  paper_publication_date, {paper_url}, paper_doi,
                  paper_citations, paper_keywords, None, paper_number_of_pages, paper_pages)

    return paper
Exemplo n.º 7
0
def _get_paper(paper_entry: dict, publication: Publication) -> Paper:
    """
    Using a paper entry provided, this method builds a paper instance

    Parameters
    ----------
    paper_entry : dict
        A paper entry retrieved from scopus API
    publication : Publication
        A publication instance that will be associated with the paper

    Returns
    -------
    Paper
        A paper instance or None
    """

    # getting data

    paper_title = paper_entry.get('dc:title', None)

    if paper_title is None or len(paper_title) == 0:
        return None

    paper_publication_date = paper_entry.get('prism:coverDate', None)
    paper_doi = paper_entry.get('prism:doi', None)
    paper_citations = paper_entry.get('citedby-count', None)
    paper_first_author = paper_entry.get('dc:creator', None)
    paper_abstract = None
    paper_authors = []
    paper_urls = set()
    paper_keywords = set()
    paper_pages = None
    paper_number_of_pages = None

    # post processing data

    if paper_first_author is not None:
        paper_authors.append(paper_first_author)

    if paper_publication_date is not None:
        date_split = paper_publication_date.split('-')
        paper_publication_date = datetime.date(
            int(date_split[0]), int(date_split[1]), int(date_split[2]))

    if paper_publication_date is None:
        return None

    if paper_citations is not None:
        paper_citations = int(paper_citations)

    # enriching data

    paper_scopus_link = None
    for link in paper_entry.get('link', []):
        if link.get('@ref') == 'scopus':
            paper_scopus_link = link.get('@href')
            break

    if paper_scopus_link is not None:

        paper_urls.add(paper_scopus_link)

        try:

            paper_page = _get_paper_page(paper_scopus_link)

            paper_abstract = paper_page.xpath(
                '//section[@id="abstractSection"]//p//text()[normalize-space()]')
            if len(paper_abstract) > 0:
                paper_abstract = re.sub(
                    '\xa0', ' ', ''.join(paper_abstract)).strip()

            authors = paper_page.xpath(
                '//*[@id="authorlist"]/ul/li/span[@class="previewTxt"]')
            paper_authors = []
            for author in authors:
                paper_authors.append(author.text.strip())

            keywords = paper_page.xpath('//*[@id="authorKeywords"]/span')
            for keyword in keywords:
                paper_keywords.add(keyword.text.strip())

            try:
                paper_pages = paper_page.xpath(
                    '//span[@id="journalInfo"]')[0].text.split('Pages')[1].strip()
                if paper_pages.isdigit():  # pragma: no cover
                    paper_number_of_pages = 1
                else:
                    pages_split = paper_pages.split('-')
                    paper_number_of_pages = abs(
                        int(pages_split[0])-int(pages_split[1]))+1
            except Exception:  # pragma: no cover
                pass

        except Exception as e:
            logging.debug(e, exc_info=True)

    paper = Paper(paper_title, paper_abstract, paper_authors, publication,
                  paper_publication_date, paper_urls, paper_doi, paper_citations, paper_keywords,
                  None, paper_number_of_pages, paper_pages)

    return paper
Exemplo n.º 8
0
def test_output(search: Search, paper: Paper):
    """
    Check which BibTeX entry headers findpapers.generate_bibtex emits for
    a journal, a conference, a book and an unpublished paper, with no
    filter, with only_selected_papers and with two categories_filter
    values.
    """
    import os

    paper.publication.category = 'Journal'
    paper.categories = {'Facet A': ['Category A', 'Category B']}
    paper.selected = False
    search.add_paper(paper)

    other_paper = copy.deepcopy(paper)
    other_paper.publication.issn = 'ISSN-CONF'
    other_paper.publication.category = 'Conference Proceedings'
    other_paper.title = 'Conference paper title'
    other_paper.doi = 'fake-doi-conference-paper'
    other_paper.selected = True
    other_paper.categories = {
        'Facet A': ['Category C'],
        'Facet B': ['Category 1']
    }
    search.add_paper(other_paper)

    other_paper = copy.deepcopy(paper)
    other_paper.publication.issn = 'ISSN-BOOK'
    other_paper.publication.category = 'Book'
    other_paper.title = 'Book paper title'
    other_paper.doi = 'fake-doi-book-paper'
    other_paper.categories = None
    search.add_paper(other_paper)

    other_paper = copy.deepcopy(paper)
    other_paper.publication = None
    other_paper.title = 'Unpublished paper title'
    other_paper.doi = None
    other_paper.selected = True
    other_paper.categories = {'Facet A': ['Category A']}
    search.add_paper(other_paper)

    article_header = '@article{drpaul1969awesome'
    inproceedings_header = '@inproceedings{drpaul1969conference'
    book_header = '@book{drpaul1969book'
    unpublished = '@unpublished{drpaul1969unpublished'

    # a private directory removed on exit; the previous
    # NamedTemporaryFile().name only borrowed the name of an
    # already-deleted file and left the written files behind
    with tempfile.TemporaryDirectory() as workdir:
        search_path = os.path.join(workdir, 'search_result')
        outputpath = os.path.join(workdir, 'generated_bibtex')

        persistence_util.save(search, search_path)

        def generate(**options):
            # regenerate the BibTeX file with the given options and return its content
            findpapers.generate_bibtex(search_path, outputpath, **options)
            with open(outputpath) as fp:
                return fp.read()

        generated_bibtex = generate()

        assert article_header in generated_bibtex
        assert inproceedings_header in generated_bibtex
        assert book_header in generated_bibtex
        assert unpublished in generated_bibtex

        generated_bibtex = generate(only_selected_papers=True)

        assert article_header not in generated_bibtex
        assert inproceedings_header in generated_bibtex
        assert book_header not in generated_bibtex
        assert unpublished in generated_bibtex

        generated_bibtex = generate(categories_filter={
            'Facet A': ['Category A'],
            'Facet B': ['Category 1']
        })

        assert article_header in generated_bibtex
        assert inproceedings_header in generated_bibtex
        assert book_header not in generated_bibtex
        assert unpublished in generated_bibtex

        generated_bibtex = generate(
            categories_filter={'Facet A': ['Category B', 'Category C']})

        assert article_header in generated_bibtex
        assert inproceedings_header in generated_bibtex
        assert book_header not in generated_bibtex
        assert unpublished not in generated_bibtex
Exemplo n.º 9
0
def test_paper(paper: Paper):
    """
    Exercise the incoming paper's initial state, its database and url
    bookkeeping, and enrich() filling a blanked paper from another one.
    """

    # initial state
    assert paper.title == 'awesome paper title'
    assert paper.abstract == 'a long abstract'
    assert paper.authors == ['Dr Paul', 'Dr John', 'Dr George', 'Dr Ringo']
    assert len(paper.urls) == 1
    assert len(paper.databases) == 5

    # databases: unknown names are rejected, repeated names are not duplicated
    paper.databases = set()

    with pytest.raises(ValueError):
        paper.add_database('INVALID DATABASE')

    for _ in range(2):
        paper.add_database('Scopus')
    assert len(paper.databases) == 1

    paper.add_database('ACM')
    assert len(paper.databases) == 2

    # urls: re-adding a known url changes nothing, a new one is appended
    assert len(paper.urls) == 1
    known_url = next(iter(paper.urls))
    paper.add_url(known_url)
    assert len(paper.urls) == 1

    paper.add_url('another://url')
    assert len(paper.urls) == 2

    # the donor paper used for enrichment
    donor_citations = 10
    donor_doi = 'DOI-X'
    donor_keywords = {'key-A', 'key-B', 'key-C'}
    donor_comments = 'some comments'

    donor = Paper('another awesome title paper', 'a long abstract',
                  paper.authors, paper.publication,
                  paper.publication_date, paper.urls, donor_doi,
                  donor_citations, donor_keywords,
                  donor_comments)
    donor.add_database('arXiv')

    # blank out the paper so that everything has to come from the donor
    blanked_fields = {
        'publication_date': None,
        'abstract': None,
        'authors': None,
        'keywords': None,
        'publication': None,
        'doi': None,
        'citations': 0,
        'comments': None,
        'number_of_pages': None,
        'pages': None,
    }
    for field_name, blank_value in blanked_fields.items():
        setattr(paper, field_name, blank_value)

    paper.enrich(donor)
    assert paper.publication_date == donor.publication_date
    assert paper.abstract == donor.abstract
    assert paper.authors == donor.authors
    assert paper.keywords == donor.keywords

    assert 'arXiv' in paper.databases
    assert len(paper.databases) == 3
    assert paper.doi == donor_doi
    # the donor's citation count was higher than the paper's
    assert paper.citations == donor_citations
    assert paper.keywords == donor_keywords
    assert paper.comments == donor_comments
Exemplo n.º 10
0
def test_search(paper: Paper):
    """
    Exercise Search: adding and removing papers, lookups, the
    per-database limit, duplication merging and publication keys.
    """

    paper.doi = None

    since = datetime.date(1969, 1, 30)
    until = datetime.date(1970, 4, 8)
    search = Search('this AND that', since, until, 2)

    assert len(search.papers) == 0

    # adding the same paper twice keeps a single entry
    search.add_paper(paper)
    assert len(search.papers) == 1
    search.add_paper(paper)
    assert len(search.papers) == 1

    second_paper = Paper('awesome paper title 2', 'a long abstract',
                         paper.authors, paper.publication,
                         paper.publication_date, paper.urls)
    second_paper.add_database('arXiv')

    search.add_paper(second_paper)
    assert len(search.papers) == 2

    # lookups
    found_paper = search.get_paper(paper.title, paper.publication_date,
                                   paper.doi)
    assert paper == found_paper
    found_publication = search.get_publication(paper.publication.title,
                                               paper.publication.issn,
                                               paper.publication.isbn)
    assert paper.publication == found_publication

    search.remove_paper(second_paper)
    assert len(search.papers) == 1
    assert paper in search.papers

    # per-database limit
    search.limit_per_database = 1
    with pytest.raises(OverflowError):
        search.add_paper(second_paper)
    search.limit_per_database = 2

    search.add_paper(second_paper)
    assert len(search.papers) == 2

    third_paper = copy.deepcopy(paper)
    third_paper.title = 'awesome paper title 3'
    third_paper.abstract = 'a long abstract'
    third_paper.databases = set()

    # a paper without any database is rejected
    with pytest.raises(ValueError):
        search.add_paper(third_paper)

    third_paper.add_database('arXiv')

    with pytest.raises(OverflowError):
        search.add_paper(third_paper)

    search.merge_duplications()
    assert len(search.papers) == 1

    # publication keys: ISBN is preferred, then ISSN, then the title
    publication_title = 'FAKE-TITLE'
    publication_issn = 'FAKE-ISSN'
    publication_isbn = 'FAKE-ISBN'

    isbn_key = search.get_publication_key(
        publication_title, publication_issn, publication_isbn)
    assert isbn_key == f'ISBN-{publication_isbn.lower()}'

    issn_key = search.get_publication_key(publication_title, publication_issn)
    assert issn_key == f'ISSN-{publication_issn.lower()}'

    title_key = search.get_publication_key(publication_title)
    assert title_key == f'TITLE-{publication_title.lower()}'