def get_authors(authors, conn, refresh=False, verbose=False):
    """Fetch author data for a list of Scopus Author IDs, preferring
    the local SQL cache and falling back to a stacked Scopus query.

    Parameters
    ----------
    authors : list
        List of Scopus Author IDs to search.

    conn : sqlite3 connection
        Standing connection to a SQLite3 database.

    refresh : bool (optional, default=False)
        Whether to refresh scopus cached files if they exist, or not.

    verbose : bool (optional, default=False)
        Whether to print information on the search progress.

    Returns
    -------
    data : DataFrame
        Data on the provided authors.
    """
    from string import Template

    # Ask the SQL cache first; `missing` holds IDs it cannot answer
    frame = pd.DataFrame(authors, columns=["auth_id"], dtype="uint64")
    data, missing = retrieve_authors(frame, conn)

    # Download absent profiles, cache them, then re-read from the cache
    # so the returned frame has a single, consistent provenance
    if missing:
        if verbose:
            print("Pre-filtering...")
        fetched = stacked_query(
            group=missing,
            refresh=refresh,
            joiner=") OR AU-ID(",
            q_type="author",
            template=Template("AU-ID($fill)"),
            stacked=True,
            verbose=verbose,
        )
        insert_data(pd.DataFrame(fetched), conn, table="authors")
        data, _ = retrieve_authors(frame, conn)
    return data
def get_authors_from_sourceyear(df, conn, refresh=False, stacked=False,
                                verbose=False):
    """Get authors publishing in specified sources in specified years,
    reading from the SQL cache and downloading whatever is missing.

    Parameters
    ----------
    df : DataFrame
        DataFrame of source-year-combinations to be searched for.

    conn : sqlite3 connection
        Standing connection to an SQLite3 database.

    refresh : bool (optional, default=False)
        Whether to refresh cached search files.

    stacked : bool (optional, default=False)
        Whether to use fewer queries that are not reusable, or to use
        modular queries of the form
        "SOURCE-ID(<SID>) AND PUBYEAR IS <YYYY>".

    verbose : bool (optional, default=False)
        Whether to print information on the search progress.

    Returns
    -------
    data : DataFrame
        DataFrame in format ("source_id", "year", "auids", "afid"),
        where entries correspond to an individual paper.
    """
    # Split the request into cache hits and combinations still unknown
    data, missing = retrieve_authors_from_sourceyear(df, conn, refresh=refresh)

    # Query Scopus year by year for the unknown combinations
    downloaded = [pd.DataFrame()]
    without_info = []
    for year in missing["year"].unique():
        year_mask = missing["year"] == year
        sources = missing[year_mask]["source_id"].unique()
        new = query_pubs_by_sourceyear(sources, year, refresh=refresh,
                                       stacked=stacked, verbose=verbose)
        # Sources yielding nothing get an explicit empty marker below,
        # so they are not re-queried next time
        lacking = set(sources) - set(new["source_id"].unique())
        without_info.extend((s, year) for s in lacking)
        downloaded.append(new)
    to_add = pd.concat(downloaded)

    # Normalize the author-ID column to lists for the caller
    data = pd.concat([data, to_add])
    data = data[data["auids"] != ""]
    data["auids"] = data["auids"].str.replace(";", ",").str.split(",")

    # Persist fresh rows plus empty markers for fruitless source-years
    if without_info:
        sources, years = list(zip(*without_info))
        d = {"source_id": sources, "year": years,
             "auids": [""] * len(sources), "afid": [""] * len(sources)}
        to_add = pd.concat([to_add, pd.DataFrame(d)])
    if not to_add.empty:
        to_add["auids"] = to_add["auids"].str.replace(";", ",").str.split(",")
        insert_data(to_add, conn, table="sources_afids")
    return data
def find_matches(original, stacked, verbose, refresh):
    """Find matches within the search group.

    Parameters
    ----------
    original : sosia.Original()
        The object containing information for the original scientist
        to search for.  Attribute search_group needs to exist.

    stacked : bool (optional, default=False)
        Whether to combine searches in few queries or not.  Cached
        files will most likely not be reusable.  Set to True if you
        query in distinct fields or you want to minimize API key usage.

    verbose : bool (optional, default=False)
        Whether to report on the progress of the process.

    refresh : bool (optional, default=False)
        Whether to refresh cached results (if they exist) or not. If
        int is passed and stacked=False, results will be refreshed if
        they are older than that value in number of days.

    Returns
    -------
    matches : list
        Sorted Scopus Author IDs of authors that pass every filtering
        round (field, publication count, citations, coauthors, first
        year, and optionally period and affiliation screens).
    """
    # Variables: acceptable ranges for each matching characteristic
    _years = range(original.first_year - original.first_year_margin,
                   original.first_year + original.first_year_margin + 1)
    _npapers = margin_range(len(original.publications), original.pub_margin)
    _max_papers = max(_npapers)
    _ncits = margin_range(original.citations, original.cits_margin)
    _max_cits = max(_ncits)
    _ncoauth = margin_range(len(original.coauthors), original.coauth_margin)
    _max_coauth = max(_ncoauth)
    if original.period:
        # Period comparison: target ranges come from period counts instead
        _npapers = margin_range(len(original.publications_period),
                                original.pub_margin)
        _ncits = margin_range(original.citations_period, original.cits_margin)
        _ncoauth = margin_range(len(original.coauthors_period),
                                original.coauth_margin)
    text = "Searching through characteristics of "\
           f"{len(original.search_group):,} authors..."
    custom_print(text, verbose)
    conn = original.sql_conn

    # First round of filtering: minimum publications and main field
    # create df of authors
    authors = get_authors(original.search_group, original.sql_conn,
                          verbose=verbose)
    same_field = authors['areas'].str.startswith(original.main_field[1])
    enough_pubs = authors['documents'].astype(int) >= int(min(_npapers))
    group = sorted(authors[same_field & enough_pubs]["auth_id"].tolist())
    text = f"Left with {len(group):,} authors with sufficient "\
           "number of publications and same main field"
    custom_print(text, verbose)

    # Second round of filtering:
    # Check having no publications before minimum year, and if 0, the
    # number of publications in the relevant period.
    params = {"group": group, "ybefore": min(_years) - 1,
              "yupto": original.year, "npapers": _npapers,
              "yfrom": original._period_year, "verbose": verbose,
              "conn": conn}
    group, _, _ = filter_pub_counts(**params)
    # Screen out profiles with too many publications over the full period
    if original.period:
        params.update({"npapers": [1, _max_papers], "yfrom": None,
                       "group": group})
        group, _, _ = filter_pub_counts(**params)
    text = f"Left with {len(group):,} researchers"
    custom_print(text, verbose)

    # Third round of filtering: citations (in the FULL period)
    authors = pd.DataFrame({"auth_id": group, "year": original.year})
    auth_cits, missing = retrieve_author_info(authors, conn, "author_ncits")
    if not missing.empty:
        total = missing.shape[0]
        text = f"Counting citations of {total:,} authors..."
        custom_print(text, verbose)
        missing['n_cits'] = 0
        print_progress(0, total, verbose)
        start = 0
        # NOTE(review): iterrows() yields index *labels*; the iloc slices
        # and progress math below assume a default RangeIndex (0..total-1)
        # on `missing` — confirm retrieve_author_info resets the index.
        for i, au in missing.iterrows():
            n_cits = count_citations([str(au['auth_id'])], original.year + 1)
            missing.at[i, 'n_cits'] = n_cits
            print_progress(i + 1, total, verbose)
            # Flush counted rows to the cache in batches of ~100
            if i % 100 == 0 or i == len(missing) - 1:
                insert_data(missing.iloc[start:i + 1], conn,
                            table="author_ncits")
                # FIX: was `start = i`, which re-inserted row i into the
                # next batch (duplicate insertion on every flush)
                start = i + 1
        auth_cits = pd.concat([auth_cits, missing])
    auth_cits['auth_id'] = auth_cits['auth_id'].astype("uint64")
    # Keep if citations are in range
    custom_print("Filtering based on count of citations...", verbose)
    mask = auth_cits["n_cits"].between(min(_ncits), _max_cits)
    group = auth_cits[mask]['auth_id'].tolist()

    # Fourth round of filtering: Download publications, verify coauthors
    # (in the FULL period) and first year
    text = f"Left with {len(group):,} authors\nFiltering based on "\
           "coauthor count..."
    custom_print(text, verbose)
    authors = pd.DataFrame({"auth_id": group, "year": original.year},
                           dtype="uint64")
    _, author_year_search = retrieve_author_info(authors, conn, "author_year")
    matches = []
    if not author_year_search.empty:
        q = Template(f"AU-ID($fill) AND PUBYEAR BEF {original.year + 1}")
        auth_year_group = author_year_search["auth_id"].tolist()
        params = {"group": auth_year_group, "template": q,
                  "refresh": refresh, "joiner": ") OR AU-ID(",
                  "q_type": "docs", "verbose": verbose, "stacked": stacked}
        res = stacked_query(**params)
        res = build_dict(res, auth_year_group)
        if res:
            # res can become empty after build_dict if a au_id is old
            res = pd.DataFrame.from_dict(res, orient="index")
            res["year"] = original.year
            res = res[["year", "first_year", "n_pubs", "n_coauth"]]
            res.index.name = "auth_id"
            res = res.reset_index()
            insert_data(res, original.sql_conn, table="author_year")
    authors_year, _ = retrieve_author_info(authors, conn, "author_year")
    # Check for number of coauthors within margin
    mask = authors_year["n_coauth"].between(min(_ncoauth), _max_coauth)
    # Check for year of first publication within range
    if not original.first_year_name_search:
        same_start = authors_year["first_year"].between(min(_years),
                                                        max(_years))
        mask = mask & same_start
    # Filter
    matches = sorted(authors_year[mask]["auth_id"].tolist())
    text = f"Left with {len(matches)} authors"
    custom_print(text, verbose)

    if original.period:
        text = "Filtering based on citations and coauthor count during period..."
        custom_print(text, verbose)
        # Further screen matches based on period cits and coauths;
        # iterate a copy since we remove from `matches` while looping
        to_loop = list(matches)
        for m in to_loop:
            res = base_query("docs", f"AU-ID({m})", refresh=refresh,
                             fields=["eid", "author_ids", "coverDate"])
            pubs = [p for p in res if original._period_year <=
                    int(p.coverDate[:4]) <= original.year]
            coauths = set(extract_authors(pubs)) - {str(m)}
            if not (min(_ncoauth) <= len(coauths) <= max(_ncoauth)):
                matches.remove(m)
                continue
            eids_period = [p.eid for p in pubs]
            n_cits = count_citations(eids_period, original.year + 1, [str(m)])
            if not (min(_ncits) <= n_cits <= max(_ncits)):
                matches.remove(m)

    # Eventually filter on affiliations
    if original.search_affiliations:
        text = "Filtering based on affiliations..."
        custom_print(text, verbose)
        matches[:] = [m for m in matches
                      if same_affiliation(original, m, refresh)]
    return matches