Example #1
def test_retrieve_authors_from_sourceyear():
    make_database(test_cache, drop=True)
    conn = connect_database(test_cache)
    # Variables
    expected_sources = [22900]
    expected_years = [2005, 2010]
    df = pd.DataFrame(product(expected_sources, expected_years),
                      columns=["source_id", "year"],
                      dtype="int64")
    # Populate cache
    expected = query_pubs_by_sourceyear(expected_sources,
                                        expected_years[0],
                                        refresh=refresh)
    expected["source_id"] = expected["source_id"].astype(np.int64)
    expected["afid"] = expected["afid"].astype(int).astype(str)
    expected = expected.sort_values(["auids", "afid"]).reset_index(drop=True)
    expected = expected[['source_id', 'year', 'auids', 'afid']]
    expected["auids"] = expected["auids"].str.split(";")
    insert_data(expected, conn, table="sources_afids")
    # Retrieve from cache
    incache, missing = retrieve_authors_from_sourceyear(df, conn)
    incache["afid"] = incache["afid"].astype(int).astype(str)
    incache = incache.sort_values(["auids", "afid"]).reset_index(drop=True)
    assert_frame_equal(incache, expected)
    assert_frame_equal(missing, df.tail(1).reset_index(drop=True))
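
Examples #1-#5 are excerpted test functions and omit their module-level fixtures. The sketch below shows a minimal setup they appear to assume, modeled on the complete module in Example #7; the sosia import paths are assumptions and may differ across versions.

# Hypothetical module-level setup for Examples #1-#5 (sosia import paths
# are assumptions; adjust to the installed version).
from itertools import product
from os.path import expanduser

import numpy as np
import pandas as pd
from nose.tools import assert_equal, assert_true
from pandas.testing import assert_frame_equal
from pybliometrics.scopus import AuthorSearch, ScopusSearch

from sosia.establishing import connect_database, make_database
# The helpers below are assumed importable from sosia.processing; this is
# an assumption, so adjust the paths to your sosia version.
from sosia.processing import (build_dict, insert_data,
                              query_pubs_by_sourceyear, retrieve_authors,
                              retrieve_author_info,
                              retrieve_authors_from_sourceyear, robust_join)

test_cache = expanduser("~/.sosia/test.sqlite")
refresh = 30  # refresh cached Scopus results older than 30 days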
Example #2
def test_retrieve_author_info_authoryear():
    make_database(test_cache, drop=True)
    conn = connect_database(test_cache)
    # Variables
    table = "author_year"
    expected_auth = [53164702100, 57197093438]
    search_auth = [55317901900]
    year = 2016
    df2 = pd.DataFrame(expected_auth + search_auth,
                       columns=["auth_id"],
                       dtype="int64")
    df2["year"] = year
    # Insert data
    fill = robust_join(expected_auth, sep=') OR AU-ID(')
    q = f"(AU-ID({fill})) AND PUBYEAR BEF {year+1}"
    d = build_dict(ScopusSearch(q, refresh=refresh).results, expected_auth)
    expected = pd.DataFrame.from_dict(d, orient="index", dtype="int64")
    expected = expected.sort_index().rename_axis('auth_id').reset_index()
    expected["year"] = year
    expected = expected[[
        'auth_id', 'year', 'first_year', 'n_pubs', 'n_coauth'
    ]]
    insert_data(expected, conn, table=table)
    # Retrieve data
    incache, missing = retrieve_author_info(df2, conn, table)
    assert_frame_equal(incache, expected)
    assert_equal(missing['auth_id'].tolist(), search_auth)
    assert_equal(missing['year'].tolist(), [year])
Example #3
def test_retrieve_authors():
    make_database(test_cache, drop=True)
    conn = connect_database(test_cache)
    # Variables
    expected_auth = [53164702100, 57197093438]
    df = pd.DataFrame(expected_auth, columns=["auth_id"], dtype="int64")
    expected_cols = [
        'auth_id', 'eid', 'surname', 'initials', 'givenname', 'affiliation',
        'documents', 'affiliation_id', 'city', 'country', 'areas'
    ]
    # Retrieve data
    incache, missing = retrieve_authors(df, conn)
    assert_equal(incache.shape[0], 0)
    assert_equal(incache.columns.to_list(), expected_cols)
    assert_equal(missing, expected_auth)
Example #4
def test_retrieve_author_info_authorncits():
    make_database(test_cache, drop=True)
    conn = connect_database(test_cache)
    # Variables
    table = "author_ncits"
    data = {
        "auth_id": [53164702100, 53164702100],
        "year": [2010, 2017],
        "n_cits": [0, 6]
    }
    expected = pd.DataFrame(data, dtype="int64")
    # Insert data
    insert_data(expected, conn, table=table)
    # Retrieve data
    cols = ["auth_id", "year"]
    incache, tosearch = retrieve_author_info(expected[cols], conn, table)
    assert_frame_equal(incache, expected)
    assert_true(tosearch.empty)
Example #5
def test_retrieve_authors_insert():
    make_database(test_cache, drop=True)
    conn = connect_database(test_cache)
    # Variables
    expected_auth = [53164702100, 57197093438]
    search_auth = [55317901900]
    expected_cols = [
        'auth_id', 'eid', 'surname', 'initials', 'givenname', 'affiliation',
        'documents', 'affiliation_id', 'city', 'country', 'areas'
    ]
    # Insert data
    q = f"AU-ID({robust_join(expected_auth, sep=') OR AU-ID(')})"
    res = pd.DataFrame(AuthorSearch(q, refresh=refresh).authors, dtype="int64")
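    # Scopus author EIDs have the form "9-s2.0-<author ID>", so the last
    # dash-separated segment is the author ID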
    res["auth_id"] = res["eid"].str.split("-").str[-1]
    res = res[expected_cols]
    insert_data(res, conn, table="authors")
    # Retrieve data
    df = pd.DataFrame(expected_auth + search_auth,
                      columns=["auth_id"],
                      dtype="int64")
    incache, missing = retrieve_authors(df, conn)
    assert_equal(incache.shape[0], 2)
    assert_equal(missing, [55317901900])
Example #6
    def __init__(self,
                 identifier,
                 year,
                 refresh=False,
                 period=None,
                 eids=None,
                 sql_fname=None):
        """Class to represent a scientist.

        Parameters
        ----------
        identifier : list of int
            List of Scopus Author IDs of the scientist.

        year : str or numeric
            Year for which characteristics should be defined.

        refresh : bool or int (optional, default=False)
            Whether to refresh cached results (if they exist) or not.  If
            an int is passed, results will be refreshed if they are older
            than that many days.

        period : int (optional, default=None)
            The length in years of a window ending in (and including) the
            treatment year, over which period-specific characteristics are
            additionally computed.

        eids : list (optional, default=None)
            A list of Scopus EIDs of the publications of the scientist.  If
            provided, the scientist's properties are set based on these
            publications instead of the list of publications obtained from
            the Scopus Author ID(s).

        sql_fname : str (optional, default=None)
            The path of the SQLite database to connect to.  If None, will use
            the path specified in config.ini.

        Raises
        ------
        Exception
            When there are no publications for the author until the
            provided year.
        """
        self.identifier = identifier
        self.year = int(year)
        if not sql_fname:
            sql_fname = config.get('Filepaths', 'Database')
        self.sql_conn = connect_database(sql_fname)

        # Read mapping of fields to sources
        df, names = read_fields_sources_list()
        self.field_source = df
        self.source_names = names.set_index("source_id")["title"].to_dict()

        # Load list of publications
        if eids:
            q = f"EID({' OR '.join(eids)})"
        else:
            q = f"AU-ID({') OR AU-ID('.join([str(i) for i in identifier])})"
        integrity_fields = ["eid", "author_ids", "coverDate", "source_id"]
        res = base_query("docs", q, refresh, fields=integrity_fields)
        self._publications = [p for p in res if int(p.coverDate[:4]) <= year]
        if not self._publications:
            text = "No publications found for author "\
                   f"{'-'.join([str(i) for i in identifier])} until {year}"
            raise Exception(text)
        self._eids = eids or [p.eid for p in self._publications]

        # First year of publication
        pub_years = [p.coverDate[:4] for p in self._publications]
        self._first_year = int(min(pub_years))
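        # Start year of the period window; evaluates to 0 (falsy, i.e. no
        # period) when period is None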
        self._period_year = self.year - (period or (self.year + 1)) + 1
        if self._period_year < self._first_year:
            self._period_year = 0

        # Count of citations
        search_ids = eids or identifier
        self._citations = count_citations(search_ids, self.year + 1,
                                          identifier)

        # Coauthors
        self._coauthors = set(extract_authors(
            self._publications)) - set(identifier)

        # Period counts fall back to the totals when period is None or the
        # window would start before the first year of publication
        if self._period_year:
            pubs = [
                p for p in self._publications
                if self._period_year <= int(p.coverDate[:4]) <= year
            ]
            self._publications_period = pubs
            if not self._publications_period:
                text = "No publications found for author "\
                       f"{'-'.join([str(i) for i in identifier])} between "\
                       f"{self._period_year} and {year}"
                raise Exception(text)
            eids_period = [p.eid for p in self._publications_period]
            n_cits = count_citations(eids_period, self.year + 1, identifier)
            self._citations_period = n_cits
            self._coauthors_period = set(
                extract_authors(self._publications_period))
            self._coauthors_period -= set(identifier)
        else:
            self._coauthors_period = None
            self._publications_period = None
            self._citations_period = None

        # Author search information
        source_ids = {int(p.source_id)
                      for p in self._publications if p.source_id}
        self._sources = add_source_names(source_ids, self.source_names)
        self._active_year = int(max(pub_years))
        mask = df["source_id"].isin(source_ids)
        self._fields = df[mask]["asjc"].astype(int).tolist()
        self._main_field = get_main_field(self._fields)
        if not self._main_field[0]:
            text = "Not possible to determine research field(s) of "\
                   "researcher.  Functionality is reduced."
            warn(text, UserWarning)

        # Most recent geolocation
        afid = find_main_affiliation(identifier, self._publications, year)
        self._affiliation_id = afid
        try:
            aff = AffiliationRetrieval(afid, refresh=refresh)
            self._affiliation_country = aff.country
            self._affiliation_name = aff.affiliation_name
            self._affiliation_type = aff.org_type
        except (Scopus404Error, ValueError):
            self._affiliation_country = None
            self._affiliation_name = None
            self._affiliation_type = None
        self._language = None

        # Author name from profile with most documents
        df = get_authors(self.identifier,
                         self.sql_conn,
                         refresh=refresh,
                         verbose=False)
        au = df.sort_values("documents", ascending=False).iloc[0]
        self._subjects = [a.split(" ")[0] for a in au.areas.split("; ")]
        self._surname = au.surname or None
        self._first_name = au.givenname or None
        name = ", ".join([self._surname or "", au.givenname or ""])
        if name == ", ":
            name = None
        self._name = name
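
Only __init__ is reproduced above. Under the assumption that this is sosia's Scientist class, importable as below, a minimal usage sketch:

# Hypothetical usage; the class name Scientist and its import path are
# assumptions, since only __init__ is shown above.
from sosia.classes import Scientist

# Properties of one author as of 2017; period=3 sets the period window
# to 2015-2017 (2017 - 3 + 1 = 2015).
sci = Scientist([53164702100], 2017, period=3)
print(sci.identifier, sci.year)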
Example #7
# -*- coding: utf-8 -*-
"""Tests for processing.getting module."""

from os.path import expanduser

from nose.tools import assert_equal, assert_true
from pandas import DataFrame

from sosia.establishing import connect_database
from sosia.processing.getting import get_authors

test_cache = expanduser("~/.sosia/test.sqlite")
test_conn = connect_database(test_cache)
refresh = 30


def test_get_authors():
    auth_list = [6701809842, 55208373700]
    auth_data = get_authors(auth_list, test_conn, refresh=refresh)
    assert_true(isinstance(auth_data, DataFrame))
    expected_cols = ["auth_id", "eid", "surname", "initials", "givenname",
                     "affiliation", "documents", "affiliation_id", "city",
                     "country", "areas"]
    assert_equal(auth_data.columns.tolist(), expected_cols)
    assert_equal(auth_data["auth_id"].tolist(), auth_list)
    assert_equal(auth_data["surname"].tolist(), ["Harhoff", "Baruffaldi"])