# -*- coding: utf-8 -*-
"""Tests for processing.caching module."""

from itertools import product
from os.path import expanduser

import numpy as np
import pandas as pd
from nose.tools import assert_equal, assert_true
from pandas.testing import assert_frame_equal
from pybliometrics.scopus import AuthorSearch, ScopusSearch

# NOTE: the sosia-internal import paths below are reconstructed from the
# calls in this file and are an assumption; adjust to the package layout.
from sosia.establishing import connect_database, make_database
from sosia.processing import (build_dict, insert_data,
                              query_pubs_by_sourceyear, retrieve_authors,
                              retrieve_author_info,
                              retrieve_authors_from_sourceyear, robust_join)

test_cache = expanduser("~/.sosia/test.sqlite")
refresh = 30


def test_retrieve_authors_from_sourceyear():
    make_database(test_cache, drop=True)
    conn = connect_database(test_cache)
    # Variables
    expected_sources = [22900]
    expected_years = [2005, 2010]
    df = pd.DataFrame(product(expected_sources, expected_years),
                      columns=["source_id", "year"], dtype="int64")
    # Populate cache
    expected = query_pubs_by_sourceyear(expected_sources, expected_years[0],
                                        refresh=refresh)
    expected["source_id"] = expected["source_id"].astype(np.int64)
    expected["afid"] = expected["afid"].astype(int).astype(str)
    expected = expected.sort_values(["auids", "afid"]).reset_index(drop=True)
    expected = expected[['source_id', 'year', 'auids', 'afid']]
    expected["auids"] = expected["auids"].str.split(";")
    insert_data(expected, conn, table="sources_afids")
    # Retrieve from cache
    incache, missing = retrieve_authors_from_sourceyear(df, conn)
    incache["afid"] = incache["afid"].astype(int).astype(str)
    incache = incache.sort_values(["auids", "afid"]).reset_index(drop=True)
    assert_frame_equal(incache, expected)
    assert_frame_equal(missing, df.tail(1).reset_index(drop=True))
def test_retrieve_author_info_authoryear():
    make_database(test_cache, drop=True)
    conn = connect_database(test_cache)
    # Variables
    table = "author_year"
    expected_auth = [53164702100, 57197093438]
    search_auth = [55317901900]
    year = 2016
    df2 = pd.DataFrame(expected_auth + search_auth, columns=["auth_id"],
                       dtype="int64")
    df2["year"] = year
    # Insert data
    fill = robust_join(expected_auth, sep=') OR AU-ID(')
    q = f"(AU-ID({fill})) AND PUBYEAR BEF {year+1}"
    d = build_dict(ScopusSearch(q, refresh=refresh).results, expected_auth)
    expected = pd.DataFrame.from_dict(d, orient="index", dtype="int64")
    expected = expected.sort_index().rename_axis('auth_id').reset_index()
    expected["year"] = year
    expected = expected[['auth_id', 'year', 'first_year', 'n_pubs', 'n_coauth']]
    insert_data(expected, conn, table=table)
    # Retrieve data
    incache, missing = retrieve_author_info(df2, conn, table)
    assert_frame_equal(incache, expected)
    assert_equal(missing['auth_id'].tolist(), search_auth)
    assert_equal(missing['year'].tolist(), [year])
def test_retrieve_authors():
    make_database(test_cache, drop=True)
    conn = connect_database(test_cache)
    # Variables
    expected_auth = [53164702100, 57197093438]
    df = pd.DataFrame(expected_auth, columns=["auth_id"], dtype="int64")
    expected_cols = ['auth_id', 'eid', 'surname', 'initials', 'givenname',
                     'affiliation', 'documents', 'affiliation_id', 'city',
                     'country', 'areas']
    # Retrieve data
    incache, missing = retrieve_authors(df, conn)
    assert_equal(incache.shape[0], 0)
    assert_equal(incache.columns.to_list(), expected_cols)
    assert_equal(missing, expected_auth)
def test_retrieve_author_info_authorncits():
    make_database(test_cache, drop=True)
    conn = connect_database(test_cache)
    # Variables
    table = "author_ncits"
    data = {"auth_id": [53164702100, 53164702100],
            "year": [2010, 2017],
            "n_cits": [0, 6]}
    expected = pd.DataFrame(data, dtype="int64")
    # Insert data
    insert_data(expected, conn, table=table)
    # Retrieve data
    cols = ["auth_id", "year"]
    incache, tosearch = retrieve_author_info(expected[cols], conn, table)
    assert_frame_equal(incache, expected)
    assert_true(tosearch.empty)
def test_retrieve_authors_insert():
    make_database(test_cache, drop=True)
    conn = connect_database(test_cache)
    # Variables
    expected_auth = [53164702100, 57197093438]
    search_auth = [55317901900]
    expected_cols = ['auth_id', 'eid', 'surname', 'initials', 'givenname',
                     'affiliation', 'documents', 'affiliation_id', 'city',
                     'country', 'areas']
    # Insert data
    q = f"AU-ID({robust_join(expected_auth, sep=') OR AU-ID(')})"
    res = pd.DataFrame(AuthorSearch(q, refresh=refresh).authors, dtype="int64")
    res["auth_id"] = res["eid"].str.split("-").str[-1]
    res = res[expected_cols]
    insert_data(res, conn, table="authors")
    # Retrieve data
    df = pd.DataFrame(expected_auth + search_auth, columns=["auth_id"],
                      dtype="int64")
    incache, missing = retrieve_authors(df, conn)
    assert_equal(incache.shape[0], 2)
    assert_equal(missing, [55317901900])
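# All tests above exercise the same round trip: build the tables with
# make_database(), fill them with insert_data(), and check that the
# retrieve_* helpers split their input into cached rows and the remainder
# still to be fetched from Scopus.  A minimal sketch of that contract
# (hypothetical single-ID frame; signatures as used in the tests above):
#
#     df = pd.DataFrame([53164702100], columns=["auth_id"], dtype="int64")
#     incache, missing = retrieve_authors(df, conn)
#     # First call: incache is empty, missing == [53164702100].
#     # After insert_data(res, conn, table="authors"), the same call
#     # returns the row in incache and an empty missing list.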
def __init__(self, identifier, year, refresh=False, period=None, eids=None,
             sql_fname=None):
    """Class to represent a scientist.

    Parameters
    ----------
    identifier : list of int
        List of Scopus Author IDs of the scientist.

    year : str or numeric
        Year for which characteristics should be defined.

    refresh : boolean or int (optional, default=False)
        Whether to refresh cached results (if they exist) or not.  If int
        is passed, results will be refreshed if they are older than that
        value in number of days.

    period : int (optional, default=None)
        An additional number of years prior to and including the treatment
        year for which characteristics are computed as well.

    eids : list (optional, default=None)
        A list of Scopus EIDs of the publications of the scientist.  If
        provided, the scientist's properties are set based on these
        publications instead of the list of publications obtained from
        the Scopus Author ID(s).

    sql_fname : str (optional, default=None)
        The path of the SQLite database to connect to.  If None, will use
        the path specified in config.ini.

    Raises
    ------
    Exception
        When there are no publications for the author until the
        provided year.
    """
    self.identifier = identifier
    self.year = int(year)
    if not sql_fname:
        sql_fname = config.get('Filepaths', 'Database')
    self.sql_conn = connect_database(sql_fname)

    # Read mapping of fields to sources
    df, names = read_fields_sources_list()
    self.field_source = df
    self.source_names = names.set_index("source_id")["title"].to_dict()

    # Load list of publications
    if eids:
        q = f"EID({' OR '.join(eids)})"
    else:
        q = f"AU-ID({') OR AU-ID('.join([str(i) for i in identifier])})"
    integrity_fields = ["eid", "author_ids", "coverDate", "source_id"]
    res = base_query("docs", q, refresh, fields=integrity_fields)
    self._publications = [p for p in res if int(p.coverDate[:4]) <= year]
    if not len(self._publications):
        text = "No publications found for author "\
               f"{'-'.join([str(i) for i in identifier])} until {year}"
        raise Exception(text)
    self._eids = eids or [p.eid for p in self._publications]

    # First year of publication
    pub_years = [p.coverDate[:4] for p in self._publications]
    self._first_year = int(min(pub_years))
    self._period_year = self.year - (period or (self.year + 1)) + 1
    if self._period_year < self._first_year:
        self._period_year = 0

    # Count of citations
    search_ids = eids or identifier
    self._citations = count_citations(search_ids, self.year + 1, identifier)

    # Coauthors
    self._coauthors = set(extract_authors(self._publications)) - set(identifier)

    # Period counts simply equal the totals if period is None or reaches
    # back beyond the first year of publication
    if self._period_year:
        pubs = [p for p in self._publications
                if self._period_year <= int(p.coverDate[:4]) <= year]
        self._publications_period = pubs
        if not len(self._publications_period):
            text = "No publications found for author "\
                   f"{'-'.join([str(i) for i in identifier])} until "\
                   f"{year} in a {period}-year period"
            raise Exception(text)
        eids_period = [p.eid for p in self._publications_period]
        n_cits = count_citations(eids_period, self.year + 1, identifier)
        self._citations_period = n_cits
        self._coauthors_period = set(extract_authors(self._publications_period))
        self._coauthors_period -= set(identifier)
    else:
        self._coauthors_period = None
        self._publications_period = None
        self._citations_period = None

    # Author search information
    source_ids = set([int(p.source_id) for p in self._publications
                      if p.source_id])
    self._sources = add_source_names(source_ids, self.source_names)
    self._active_year = int(max(pub_years))
    mask = df["source_id"].isin(source_ids)
    self._fields = df[mask]["asjc"].astype(int).tolist()
    self._main_field = get_main_field(self._fields)
    if not self._main_field[0]:
        text = "Not possible to determine research field(s) of "\
               "researcher.  Functionality is reduced."
        warn(text, UserWarning)

    # Most recent geolocation
    afid = find_main_affiliation(identifier, self._publications, year)
    self._affiliation_id = afid
    try:
        aff = AffiliationRetrieval(afid, refresh=refresh)
        self._affiliation_country = aff.country
        self._affiliation_name = aff.affiliation_name
        self._affiliation_type = aff.org_type
    except (Scopus404Error, ValueError):
        self._affiliation_country = None
        self._affiliation_name = None
        self._affiliation_type = None
    self._language = None

    # Author name from profile with most documents
    df = get_authors(self.identifier, self.sql_conn, refresh=refresh,
                     verbose=False)
    au = df.sort_values("documents", ascending=False).iloc[0]
    self._subjects = [a.split(" ")[0] for a in au.areas.split("; ")]
    self._surname = au.surname or None
    self._first_name = au.givenname or None
    name = ", ".join([self._surname or "", au.givenname or ""])
    if name == ", ":
        name = None
    self._name = name
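# A minimal usage sketch (hypothetical author ID and year; assumes a valid
# Scopus API key, the SQLite database configured in config.ini, and that
# the class -- here assumed importable as sosia.classes.Scientist -- exposes
# the private attributes set above as public properties):
#
#     from sosia.classes import Scientist
#     focal = Scientist(identifier=[55208373700], year=2017, period=5)
#     print(focal.first_year, focal.main_field, focal.affiliation_country)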
# -*- coding: utf-8 -*-
"""Tests for processing.getting module."""

from os.path import expanduser

from nose.tools import assert_equal, assert_true
from pandas import DataFrame

from sosia.establishing import connect_database
from sosia.processing.getting import get_authors

test_cache = expanduser("~/.sosia/test.sqlite")
test_conn = connect_database(test_cache)
refresh = 30


def test_query_authors():
    auth_list = [6701809842, 55208373700]
    auth_data = get_authors(auth_list, test_conn, refresh=refresh)
    assert_true(isinstance(auth_data, DataFrame))
    expected_cols = ["auth_id", "eid", "surname", "initials", "givenname",
                     "affiliation", "documents", "affiliation_id", "city",
                     "country", "areas"]
    assert_equal(auth_data.columns.tolist(), expected_cols)
    assert_equal(auth_data["auth_id"].tolist(), auth_list)
    assert_equal(auth_data["surname"].tolist(), ["Harhoff", "Baruffaldi"])
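# A minimal usage sketch of get_authors outside the test (same cache file;
# hypothetical single ID, assumes a valid Scopus API key; the verbose
# parameter is used the same way as in the class code above):
#
#     conn = connect_database(expanduser("~/.sosia/test.sqlite"))
#     df = get_authors([6701809842], conn, refresh=False, verbose=False)
#     print(df[["auth_id", "surname", "documents"]].head())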