def stacked_query(group, template, joiner, q_type, refresh, stacked, verbose):
    """Auxiliary function to query list of items.

    Parameters
    ----------
    group : list of str
        Scopus IDs (of authors or sources) for which the stacked query
        should be conducted.
    template : string.Template()
        A string template with one parameter named `fill` which will be
        used as search query.
    joiner : str
        On which the group elements should be joined to fill the query.
    q_type : str
        Determines the query search that will be used.  Allowed values:
        "author", "docs".
    refresh : bool
        Whether the cached files should be refreshed or not.
    stacked : bool
        If True, search for queries as close as possible to the maximum
        length QUERY_MAX_LEN.  If False, search elements in group one
        by one.
    verbose : bool (optional, default=False)
        Whether to print information on the search progress.

    Returns
    -------
    res : list
        A list of namedtuples representing publications.
    """
    maxlen = 1
    if stacked:
        maxlen = QUERY_MAX_LEN
    queries = create_queries(group, joiner, template, maxlen)
    total = len(queries)
    print_progress(0, total, verbose)
    res = []
    for i, q in enumerate(queries):
        print_progress(i + 1, total, verbose)
        res.extend(long_query(q, q_type, template, refresh))
    return res
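
# --- Illustration (not part of the library) --------------------------------
# A minimal sketch of the packing step that create_queries is assumed to
# perform: greedily join IDs until the filled query would exceed the
# maximum length.  All names below are illustrative, not sosia's API.
def _demo_pack_queries():
    from string import Template

    def pack_queries(ids, joiner, template, maxlen):
        """Greedily fill queries up to maxlen characters (sketch only;
        a single over-long ID is not handled here)."""
        queries, chunk = [], []
        for item in ids:
            candidate = chunk + [str(item)]
            if len(template.substitute(fill=joiner.join(candidate))) > maxlen:
                queries.append(template.substitute(fill=joiner.join(chunk)))
                chunk = [str(item)]
            else:
                chunk = candidate
        if chunk:
            queries.append(template.substitute(fill=joiner.join(chunk)))
        return queries

    tmpl = Template("AU-ID($fill)")
    print(pack_queries(range(1, 8), ") OR AU-ID(", tmpl, maxlen=40))
    # ['AU-ID(1) OR AU-ID(2) OR AU-ID(3)',
    #  'AU-ID(4) OR AU-ID(5) OR AU-ID(6)', 'AU-ID(7)']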
def search_group_from_sources(self, stacked, verbose, refresh=False):
    """Define groups of authors based on publications from a set of
    sources.

    Parameters
    ----------
    self : sosia.Original
        The object of the Scientist to search information for.
    stacked : bool
        Whether to combine the search in few queries or not.
    verbose : bool (optional, default=False)
        Whether to report on the progress of the process.
    refresh : bool (optional, default=False)
        Whether to refresh cached search files.

    Returns
    -------
    today, then, negative : set
        Sets of authors publishing in three periods: during the year of
        treatment, during the years to match on, and during the years
        before the first publication.
    """
    # Filtering variables
    min_year = self.first_year - self.year_margin
    max_year = self.first_year + self.year_margin
    if self.period:
        _margin_setter = self.publications_period
    else:
        _margin_setter = self.publications
    max_pubs = max(margin_range(len(_margin_setter), self.pub_margin))
    years = list(range(min_year, max_year + 1))
    search_years = [min_year - 1]
    if not self._ignore_first_id:
        search_years.extend(range(min_year, max_year + 1))
    search_sources, _ = zip(*self.search_sources)

    # Verbose variables
    n = len(search_sources)
    text = "Searching authors for search_group in {} sources...".format(n)
    custom_print(text, verbose)
    today = set()
    then = set()
    negative = set()
    if stacked:  # Make use of SQL cache
        # Year provided (select also based on location)
        # Get already cached sources from cache
        sources_ay = DataFrame(list(product(search_sources,
                                            [self.active_year])),
                               columns=["source_id", "year"])
        _, _search = sources_in_cache(sources_ay, refresh=refresh, afid=True)
        res = query_year(self.active_year, _search.source_id.tolist(),
                         refresh, verbose, afid=True)
        cache_insert(res, table="sources_afids")
        sources_ay, _ = sources_in_cache(sources_ay, refresh=refresh,
                                         afid=True)
        # Authors publishing in provided year and locations
        mask = None
        if self.search_affiliations:
            mask = sources_ay.afid.isin(self.search_affiliations)
        today = flat_set_from_df(sources_ay, "auids", mask)
        # Years before the active year
        # Get already cached sources from cache
        sources_ys = DataFrame(list(product(search_sources, search_years)),
                               columns=["source_id", "year"])
        _, sources_ys_search = sources_in_cache(sources_ys, refresh=refresh)
        missing_years = set(sources_ys_search.year.tolist())
        # Eventually add information for missing years to cache
        for y in missing_years:
            mask = sources_ys_search.year == y
            _sources_search = sources_ys_search[mask].source_id.tolist()
            res = query_year(y, _sources_search, refresh, verbose)
            cache_insert(res, table="sources")
        # Get full cache
        sources_ys, _ = sources_in_cache(sources_ys, refresh=False)
        # Authors publishing in year(s) of first publication
        if not self._ignore_first_id:
            mask = sources_ys.year.between(min_year, max_year,
                                           inclusive=True)
            then = flat_set_from_df(sources_ys, "auids", mask)
        # Authors with publications before
        mask = sources_ys.year < min_year
        negative = flat_set_from_df(sources_ys, "auids", mask)
    else:
        auth_count = []
        print_progress(0, n, verbose)
        for i, source_id in enumerate(search_sources):
            info = query_journal(source_id, [self.active_year] + years,
                                 refresh)
            today.update(info[str(self.active_year)])
            if not self._ignore_first_id:
                for y in years:
                    then.update(info[str(y)])
            for y in range(int(min(info.keys())), min_year):
                negative.update(info[str(y)])
            for y in info:
                if int(y) <= self.active_year:
                    auth_count.extend(info[str(y)])
            print_progress(i + 1, n, verbose)
        c = Counter(auth_count)
        negative.update({a for a, npub in c.items() if npub > max_pubs})
    return today, then, negative
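
# --- Illustration (not part of the library) --------------------------------
# Hedged sketch of what flat_set_from_df is assumed to do above: flatten
# a column of ";"-separated author IDs into one set, optionally
# restricted by a boolean mask.  This is an illustrative reimplementation.
def _demo_flat_set_from_df():
    import pandas as pd

    def flat_set_from_df(df, col, mask=None):
        if mask is not None:
            df = df[mask]
        lists = df[col].str.split(";").tolist()
        return set(item for sub in lists for item in sub)

    df = pd.DataFrame({"auids": ["1;2", "2;3"], "afid": [10, 20]})
    print(flat_set_from_df(df, "auids", mask=df["afid"] == 10))  # {'1', '2'}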
def filter_pub_counts(group, ybefore, yupto, npapers, yfrom=None,
                      verbose=False):
    """Filter authors based on restrictions in the number of
    publications in different periods, searched by query_size.

    Parameters
    ----------
    group : list of str
        Scopus IDs of authors to be filtered.
    ybefore : int
        Year to be used as first year.  Publications in this year and
        before need to be 0.
    yupto : int
        Year up to which to count publications.
    npapers : list
        List of count of publications, minimum and maximum.
    yfrom : int (optional, default=None)
        If provided, publications are counted only after this year.
        Publications are still required to be 0 before ybefore.
    verbose : bool (optional, default=False)
        Whether to print information on the search progress.

    Returns
    -------
    group : list of str
        Scopus IDs filtered.
    pubs_counts : list of int
        List of counts of publications within the period provided for
        the authors in group.
    older_authors : list of str
        Scopus IDs filtered out because they have publications before
        ybefore.

    Notes
    -----
    Cached values are used first; more data is queried only if needed.
    """
    group = [int(x) for x in group]
    years_check = [ybefore, yupto]
    if yfrom:
        years_check.extend([yfrom - 1])
    authors = DataFrame(list(product(group, years_check)),
                        columns=["auth_id", "year"], dtype="int64")
    authors_size = author_size_in_cache(authors)
    au_skip = []
    group_tocheck = [x for x in group]
    older_authors = []
    pubs_counts = []
    # Use information in cache
    if not authors_size.empty:
        # Authors that can be removed already because they are older
        mask = (authors_size.year <= ybefore) & (authors_size.n_pubs > 0)
        remove = authors_size[mask]["auth_id"].drop_duplicates().tolist()
        older_authors.extend(remove)
        au_remove = [x for x in remove]
        # Remove authors whose publication count in a year is in any
        # case too small
        mask = ((authors_size.year >= yupto) &
                (authors_size.n_pubs < min(npapers)))
        remove = authors_size[mask]["auth_id"].drop_duplicates().tolist()
        au_remove.extend(remove)
        # Authors with no publications before the minimum year
        mask = (authors_size.year == ybefore) & (authors_size.n_pubs == 0)
        au_ok_miny = authors_size[mask]["auth_id"].drop_duplicates().tolist()
        # Check publications in range
        if yfrom:
            # Adjust the count by subtracting the count before the
            # period; keep only authors for which this is possible
            mask = authors_size.year == yfrom - 1
            authors_size_bef = authors_size[mask]
            authors_size_bef["year"] = yupto
            authors_size_bef.columns = ["auth_id", "year", "n_pubs_bef"]
            bef_auth = set(authors_size_bef["auth_id"])
            mask = ((authors_size["auth_id"].isin(bef_auth)) &
                    (authors_size["year"] == yupto))
            authors_size = authors_size[mask]
            authors_size = authors_size.merge(authors_size_bef, "left",
                                              on=["auth_id", "year"])
            authors_size = authors_size.fillna(0)
            authors_size["n_pubs"] -= authors_size["n_pubs_bef"]
        # Authors that can be removed already because of their
        # publication count
        mask = (((authors_size.year >= yupto) &
                 (authors_size.n_pubs < min(npapers))) |
                ((authors_size.year <= yupto) &
                 (authors_size.n_pubs > max(npapers))))
        remove = authors_size[mask]["auth_id"].drop_duplicates().tolist()
        au_remove.extend(remove)
        # Authors with a publication count within range before the
        # given year
        mask = (((authors_size.year == yupto) &
                 (authors_size.n_pubs >= min(npapers))) &
                (authors_size.n_pubs <= max(npapers)))
        au_ok_year = authors_size[mask][["auth_id", "n_pubs"]].drop_duplicates()
        # Authors that match both conditions
        au_ok = list(set(au_ok_miny).intersection(set(au_ok_year["auth_id"])))
        mask = au_ok_year["auth_id"].isin(au_ok)
        pubs_counts = au_ok_year[mask]["n_pubs"].tolist()
        # Authors that match only the first condition, while the second
        # is unknown, can skip the first condition check
        au_skip = [x for x in au_ok_miny if x not in au_remove + au_ok]
        group = [x for x in group if x not in au_remove]
        group_tocheck = [x for x in group if x not in au_skip + au_ok]
    text = "Left with {} authors based on size information already in "\
           "cache.\n{} to check\n".format(len(group), len(group_tocheck))
    custom_print(text, verbose)
    # Verify that publications before the minimum year are 0
    if group_tocheck:
        text = "Searching through characteristics of {:,} "\
               "authors...".format(len(group_tocheck))
        custom_print(text, verbose)
        print_progress(0, len(group_tocheck), verbose)
        to_loop = [x for x in group_tocheck]  # Temporary copy
        for i, au in enumerate(to_loop):
            q = "AU-ID({}) AND PUBYEAR BEF {}".format(au, ybefore + 1)
            size = base_query("docs", q, size_only=True)
            tp = (au, ybefore, size)
            cache_insert(tp, table="author_size")
            print_progress(i + 1, len(to_loop), verbose)
            if size != 0:
                group.remove(au)
                group_tocheck.remove(au)
                older_authors.append(au)
        text = "Left with {} authors based on size information before "\
               "minimum year\nFiltering based on size query before "\
               "provided year\n".format(len(group))
        custom_print(text, verbose)
    # Verify that the publication count before the given year falls in range
    group_tocheck.extend(au_skip)
    n = len(group_tocheck)
    if group_tocheck:
        text = "Searching through characteristics of {:,} authors".format(n)
        custom_print(text, verbose)
        print_progress(0, n, verbose)
        for i, au in enumerate(group_tocheck):
            q = "AU-ID({}) AND PUBYEAR BEF {}".format(au, yupto + 1)
            n_pubs_yupto = base_query("docs", q, size_only=True)
            tp = (au, yupto, n_pubs_yupto)
            cache_insert(tp, table="author_size")
            # Eventually decrease the publication count
            if yfrom and n_pubs_yupto >= min(npapers):
                q = "AU-ID({}) AND PUBYEAR BEF {}".format(au, yfrom)
                n_pubs_yfrom = base_query("docs", q, size_only=True)
                tp = (au, yfrom - 1, n_pubs_yfrom)
                cache_insert(tp, table="author_size")
                n_pubs_yupto -= n_pubs_yfrom
            if n_pubs_yupto < min(npapers) or n_pubs_yupto > max(npapers):
                group.remove(au)
            else:
                pubs_counts.append(n_pubs_yupto)
            print_progress(i + 1, n, verbose)
    return group, pubs_counts, older_authors
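
# --- Illustration (not part of the library) --------------------------------
# The counting queries above rely on Scopus' PUBYEAR BEF operator being
# strictly "before": counting everything up to and including `year`
# therefore requires `year + 1`.  A string-only sketch (no API call):
def _demo_size_query():
    def size_query(auth_id, year):
        return "AU-ID({}) AND PUBYEAR BEF {}".format(auth_id, year + 1)

    print(size_query(55208373700, 2010))
    # AU-ID(55208373700) AND PUBYEAR BEF 2011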
def find_matches(original, stacked, verbose, refresh):
    """Find matches within the search group.

    Parameters
    ----------
    original : sosia.Original()
        The object containing information for the original scientist to
        search for.  Attribute search_group needs to exist.
    stacked : bool (optional, default=False)
        Whether to combine searches in few queries or not.  Cached
        files will most likely not be reusable.  Set to True if you
        query in distinct fields or you want to minimize API key usage.
    verbose : bool (optional, default=False)
        Whether to report on the progress of the process.
    refresh : bool (optional, default=False)
        Whether to refresh cached results (if they exist) or not.  If
        an int is passed and stacked=False, results will be refreshed
        if they are older than that value in number of days.
    """
    # Variables
    _years = range(original.first_year - original.first_year_margin,
                   original.first_year + original.first_year_margin + 1)
    _npapers = margin_range(len(original.publications), original.pub_margin)
    _max_papers = max(_npapers)
    _ncits = margin_range(original.citations, original.cits_margin)
    _max_cits = max(_ncits)
    _ncoauth = margin_range(len(original.coauthors), original.coauth_margin)
    _max_coauth = max(_ncoauth)
    if original.period:
        _npapers = margin_range(len(original.publications_period),
                                original.pub_margin)
        _ncits = margin_range(original.citations_period,
                              original.cits_margin)
        _ncoauth = margin_range(len(original.coauthors_period),
                                original.coauth_margin)
    text = "Searching through characteristics of "\
           f"{len(original.search_group):,} authors..."
    custom_print(text, verbose)
    conn = original.sql_conn

    # First round of filtering: minimum publications and main field
    # Create df of authors
    authors = get_authors(original.search_group, original.sql_conn,
                          verbose=verbose)
    same_field = authors['areas'].str.startswith(original.main_field[1])
    enough_pubs = authors['documents'].astype(int) >= int(min(_npapers))
    group = sorted(authors[same_field & enough_pubs]["auth_id"].tolist())
    text = f"Left with {len(group):,} authors with a sufficient "\
           "number of publications and the same main field"
    custom_print(text, verbose)

    # Second round of filtering: Check having no publications before
    # the minimum year, and, if passed, the number of publications in
    # the relevant period
    params = {"group": group, "ybefore": min(_years) - 1,
              "yupto": original.year, "npapers": _npapers,
              "yfrom": original._period_year, "verbose": verbose,
              "conn": conn}
    group, _, _ = filter_pub_counts(**params)
    # Screen out profiles with too many publications over the full period
    if original.period:
        params.update({"npapers": [1, _max_papers], "yfrom": None,
                       "group": group})
        group, _, _ = filter_pub_counts(**params)
    text = f"Left with {len(group):,} researchers"
    custom_print(text, verbose)

    # Third round of filtering: citations (in the FULL period)
    authors = pd.DataFrame({"auth_id": group, "year": original.year})
    auth_cits, missing = retrieve_author_info(authors, conn, "author_ncits")
    if not missing.empty:
        total = missing.shape[0]
        text = f"Counting citations of {total:,} authors..."
        custom_print(text, verbose)
        missing['n_cits'] = 0
        print_progress(0, total, verbose)
        start = 0
        for i, au in missing.iterrows():
            n_cits = count_citations([str(au['auth_id'])], original.year + 1)
            missing.at[i, 'n_cits'] = n_cits
            print_progress(i + 1, total, verbose)
            if i % 100 == 0 or i == len(missing) - 1:
                insert_data(missing.iloc[start:i + 1], conn,
                            table="author_ncits")
                start = i + 1  # Next batch starts after the inserted rows
    auth_cits = pd.concat([auth_cits, missing])
    auth_cits['auth_id'] = auth_cits['auth_id'].astype("uint64")
    # Keep if citations are in range
    custom_print("Filtering based on count of citations...", verbose)
    mask = auth_cits["n_cits"].between(min(_ncits), _max_cits)
    group = auth_cits[mask]['auth_id'].tolist()

    # Fourth round of filtering: Download publications, verify coauthors
    # (in the FULL period) and first year
    text = f"Left with {len(group):,} authors\nFiltering based on "\
           "coauthor count..."
    custom_print(text, verbose)
    authors = pd.DataFrame({"auth_id": group, "year": original.year},
                           dtype="uint64")
    _, author_year_search = retrieve_author_info(authors, conn,
                                                 "author_year")
    matches = []
    if not author_year_search.empty:
        q = Template(f"AU-ID($fill) AND PUBYEAR BEF {original.year + 1}")
        auth_year_group = author_year_search["auth_id"].tolist()
        params = {"group": auth_year_group, "template": q,
                  "refresh": refresh, "joiner": ") OR AU-ID(",
                  "q_type": "docs", "verbose": verbose, "stacked": stacked}
        res = stacked_query(**params)
        res = build_dict(res, auth_year_group)
        if res:
            # res can become empty after build_dict if an auth_id is old
            res = pd.DataFrame.from_dict(res, orient="index")
            res["year"] = original.year
            res = res[["year", "first_year", "n_pubs", "n_coauth"]]
            res.index.name = "auth_id"
            res = res.reset_index()
            insert_data(res, original.sql_conn, table="author_year")
    authors_year, _ = retrieve_author_info(authors, conn, "author_year")
    # Check for the number of coauthors within margin
    mask = authors_year["n_coauth"].between(min(_ncoauth), _max_coauth)
    # Check for the year of first publication within range
    if not original.first_year_name_search:
        same_start = authors_year["first_year"].between(min(_years),
                                                        max(_years))
        mask = mask & same_start
    # Filter
    matches = sorted(authors_year[mask]["auth_id"].tolist())
    text = f"Left with {len(matches)} authors"
    custom_print(text, verbose)

    if original.period:
        text = "Filtering based on citations and coauthor count "\
               "during period..."
        custom_print(text, verbose)
        # Further screen matches based on period citations and coauthors
        to_loop = [m for m in matches]  # Temporary copy
        for m in to_loop:
            res = base_query("docs", f"AU-ID({m})", refresh=refresh,
                             fields=["eid", "author_ids", "coverDate"])
            pubs = [p for p in res if original._period_year <=
                    int(p.coverDate[:4]) <= original.year]
            coauths = set(extract_authors(pubs)) - {str(m)}
            if not min(_ncoauth) <= len(coauths) <= max(_ncoauth):
                matches.remove(m)
                continue
            eids_period = [p.eid for p in pubs]
            n_cits = count_citations(eids_period, original.year + 1,
                                     [str(m)])
            if not min(_ncits) <= n_cits <= max(_ncits):
                matches.remove(m)
    # Eventually filter on affiliations
    if original.search_affiliations:
        custom_print("Filtering based on affiliations...", verbose)
        matches[:] = [m for m in matches
                      if same_affiliation(original, m, refresh)]
    return matches
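
# --- Illustration (not part of the library) --------------------------------
# A hedged reimplementation of margin_range as used throughout
# find_matches, under the assumption that an int margin is absolute
# while a float margin is relative to the base value (rounded up).
# Check sosia's own helper for the authoritative behavior.
def _demo_margin_range():
    from math import ceil

    def margin_range(base, val):
        if isinstance(val, float):
            margin = ceil(val * base)
        else:
            margin = val
        return range(base - margin, base + margin + 1)

    print(list(margin_range(10, 2)))    # [8, 9, 10, 11, 12]
    print(list(margin_range(10, 0.2)))  # [8, 9, 10, 11, 12]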
def inform_matches(self, keywords, verbose, refresh):
    """Add match-specific information to all matches.

    Parameters
    ----------
    self : sosia.Original()
        Object whose matches should receive additional information.
    keywords : iterable of strings
        Which information to add to matches.
    verbose : bool
        Whether to report on the progress of the process and the
        completeness of document information.
    refresh : bool
        Whether to refresh all cached files or not.

    Returns
    -------
    out : list of namedtuples
        A list of namedtuples representing matches.  Provided
        information depends on the provided keywords.
    """
    from sosia.classes import Scientist

    # Create Match object
    fields = "ID name " + " ".join(keywords)
    m = namedtuple("Match", fields)
    # Preparation
    doc_parse = "num_cited_refs" in keywords
    if doc_parse:
        focal_docs = parse_docs([d.eid for d in self.publications], refresh)
        focal_refs, focal_refs_n = focal_docs
    # Add selected information match-by-match
    out = []
    completeness = {}
    total = len(self.matches)
    print_progress(0, total, verbose)
    for idx, auth_id in enumerate(self.matches):
        period = self.year + 1 - self._period_year
        p = Scientist([auth_id], self.year, period=period,
                      refresh=refresh, sql_fname=self.sql_fname)
        match_info = inform_match(p, keywords, refresh=refresh)
        # Reference overlap requires parsing each match's documents
        if doc_parse:
            eids = [d.eid for d in p.publications]
            refs, refs_n = parse_docs(eids, refresh)
            completeness[auth_id] = (refs_n, len(eids))
            if "num_cited_refs" in keywords:
                ref_cos = compute_overlap(refs, focal_refs)
                match_info["num_cited_refs"] = ref_cos
        out.append(m(**match_info))
        print_progress(idx + 1, total, verbose)
    # Eventually print information on missing document information
    if verbose and doc_parse:
        for auth_id, stats in completeness.items():
            _print_missing_docs([auth_id], stats[0], stats[1])
        focal_pubs_n = len(self.publications)
        _print_missing_docs(self.identifier, focal_refs_n, focal_pubs_n,
                            res_type="Original")
    return out
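
# --- Illustration (not part of the library) --------------------------------
# Assumed semantics of compute_overlap as called above: the number of
# cited references two document collections share.  The name and call
# signature mirror the call site; the set-based body is an assumption.
def _demo_compute_overlap():
    def compute_overlap(refs, focal_refs):
        return len(set(refs) & set(focal_refs))

    print(compute_overlap({"r1", "r2"}, {"r2", "r3"}))  # 1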
def stacked_query(group, res, template, joiner, q_type, refresh,
                  i=0, total=None):
    """Auxiliary function to recursively perform queries until they work.

    Parameters
    ----------
    group : list of str
        Scopus IDs (of authors or sources) for which the stacked query
        should be conducted.
    res : list
        (Initially empty) container to which the query results will be
        appended.
    template : Template()
        A string template with one parameter named `fill` which will be
        used as search query.
    joiner : str
        On which the group elements should be joined to fill the query.
    q_type : str
        Determines the query search that will be used.  Allowed values:
        "author", "docs".
    refresh : bool
        Whether the cached files should be refreshed or not.
    i : int (optional, default=0)
        A count variable to be used for printing the progress bar.
    total : int (optional, default=None)
        The total number of elements in the group.  If provided, a
        progress bar will be printed.

    Returns
    -------
    res : list
        A list of namedtuples representing publications.
    i : int
        A running variable to indicate the progress.

    Notes
    -----
    Results of each successful query are appended to `res`.
    """
    group = [str(g) for g in group]  # Make robust to passing int
    q = template.substitute(fill=joiner.join(group))
    try:
        n = base_query(q_type, q, size_only=True)
        if n > 5000 and len(group) > 1:
            raise ScopusQueryError()
        res.extend(base_query(q_type, q, refresh=refresh))
        verbose = total is not None
        i += len(group)
        print_progress(i, total, verbose)
    except (Scopus400Error, Scopus500Error, ScopusQueryError):
        # Split the query group into two equally sized groups
        mid = len(group) // 2
        params = {"group": group[:mid], "res": res, "template": template,
                  "i": i, "joiner": joiner, "q_type": q_type,
                  "total": total, "refresh": refresh}
        res, i = stacked_query(**params)
        params.update({"group": group[mid:], "i": i})
        res, i = stacked_query(**params)
    return res, i
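
# --- Illustration (not part of the library) --------------------------------
# Toy standalone demonstration of the splitting strategy above: when a
# combined query fails (here simulated by a size cap instead of a
# ScopusQueryError), halve the group and recurse on both halves.
def _demo_binary_split():
    def run(group, cap=2):
        if len(group) > cap and len(group) > 1:  # Simulate a failing query
            mid = len(group) // 2
            return run(group[:mid], cap) + run(group[mid:], cap)
        return [tuple(group)]  # One successful query

    print(run(list("abcdef")))
    # [('a',), ('b', 'c'), ('d',), ('e', 'f')]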
def find_matches(self, stacked=False, verbose=False, stop_words=STOPWORDS,
                 information=True, refresh=False, **tfidf_kwds):
    """Find matches within search_group based on five criteria:
    1. Started publishing in about the same year
    2. Has about the same number of publications in the year of treatment
    3. Has about the same number of coauthors in the year of treatment
    4. Has about the same number of citations in the year of treatment
    5. Works in the same field as the scientist's main field

    Parameters
    ----------
    stacked : bool (optional, default=False)
        Whether to combine searches in few queries or not.  Cached
        files will most likely not be reusable.  Set to True if you
        query in distinct fields or you want to minimize API key usage.
    verbose : bool (optional, default=False)
        Whether to report on the progress of the process.
    stop_words : list (optional, default=STOPWORDS)
        A list of words that should be filtered in the analysis of
        abstracts.  The default is the list of English stopwords from
        nltk, augmented with numbers and punctuation.
    information : bool or iterable (optional, default=True)
        Whether to return additional information on the matches that
        may help in the selection process.  If an iterable of keywords
        is provided, only return information for these keywords.
        Allowed values are "first_name", "surname", "first_year",
        "num_coauthors", "num_publications", "num_citations",
        "num_coauthors_period", "num_publications_period",
        "num_citations_period", "subjects", "country",
        "affiliation_id", "affiliation", "language", "reference_sim",
        "abstract_sim".
    refresh : bool (optional, default=False)
        Whether to refresh cached search files.
    tfidf_kwds : keywords
        Parameters to pass to TfidfVectorizer from the sklearn package
        for abstract vectorization.  Not used when `information=False`
        or when "abstract_sim" is not in `information`.  See
        https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
        for possible values.

    Returns
    -------
    matches : list
        A list of Scopus IDs of scientists matching all the criteria
        (if information is False) or a list of namedtuples with the
        Scopus ID and additional information (if information is True).

    Raises
    ------
    ValueError
        If information is not bool and contains invalid keywords.
    """
    # Checks
    info_keys = ["first_name", "surname", "first_year", "num_coauthors",
                 "num_publications", "num_citations",
                 "num_coauthors_period", "num_publications_period",
                 "num_citations_period", "subjects", "country",
                 "affiliation_id", "affiliation", "language",
                 "reference_sim", "abstract_sim"]
    if isinstance(information, bool):
        if information:
            keywords = info_keys
        elif self.search_affiliations:
            information = True
            keywords = ["affiliation_id"]
        else:
            keywords = None
    else:
        keywords = information
        invalid = [x for x in keywords if x not in info_keys]
        if invalid:
            text = "Parameter information contains invalid keywords: " +\
                   ", ".join(invalid)
            raise ValueError(text)
    if self.search_affiliations and "affiliation_id" not in keywords:
        keywords.append("affiliation_id")

    # Variables
    _years = range(self.first_year - self.year_margin,
                   self.first_year + self.year_margin + 1)
    if self.period:
        _npapers = margin_range(len(self.publications_period),
                                self.pub_margin)
        _ncits = margin_range(self.citations_period, self.cits_margin)
        _ncoauth = margin_range(len(self.coauthors_period),
                                self.coauth_margin)
        _npapers_full = margin_range(len(self.publications),
                                     self.pub_margin)
        _ncits_full = margin_range(self.citations, self.cits_margin)
        _ncoauth_full = margin_range(len(self.coauthors),
                                     self.coauth_margin)
    else:
        _npapers = margin_range(len(self.publications), self.pub_margin)
        _ncits = margin_range(self.citations, self.cits_margin)
        _ncoauth = margin_range(len(self.coauthors), self.coauth_margin)
    n = len(self.search_group)
    text = "Searching through characteristics of {:,} authors".format(n)
    custom_print(text, verbose)

    # First round of filtering: minimum publications and main field
    # Create df of authors
    authors = query_author_data(self.search_group, verbose=verbose)
    same_field = authors.areas.str.startswith(self.main_field[1])
    enough_pubs = authors.documents.astype(int) >= int(min(_npapers))
    group = authors[same_field & enough_pubs]["auth_id"].tolist()
    group.sort()
    n = len(group)
    text = "Left with {} authors\nFiltering based on provided "\
           "conditions...".format(n)
    custom_print(text, verbose)

    # Second round of filtering: Check having no publications before
    # the minimum year, and, if passed, the number of publications in
    # the relevant period
    params = {"group": group, "ybefore": min(_years) - 1,
              "yupto": self.year, "npapers": _npapers,
              "yfrom": self.year_period, "verbose": verbose}
    group, _, _ = filter_pub_counts(**params)
    # Also screen out IDs with too many publications over the full period
    if self.period:
        params.update({"npapers": [1, max(_npapers_full)], "yfrom": None,
                       "group": group})
        group, _, _ = filter_pub_counts(**params)

    # Third round of filtering: citations (in the FULL period)
    authors = pd.DataFrame({"auth_id": group, "year": self.year})
    _, authors_cits_search = author_cits_in_cache(authors)
    text = "Search and filter based on count of citations\n{} to search "\
           "out of {}\n".format(len(authors_cits_search), len(group))
    custom_print(text, verbose)
    if not authors_cits_search.empty:
        authors_cits_search['n_cits'] = 0
        print_progress(0, len(authors_cits_search), verbose)
        for i, au in authors_cits_search.iterrows():
            q = "REF({}) AND PUBYEAR BEF {} AND NOT AU-ID({})".format(
                au['auth_id'], self.year + 1, au['auth_id'])
            n = base_query("docs", q, size_only=True)
            authors_cits_search.at[i, 'n_cits'] = n
            print_progress(i + 1, len(authors_cits_search), verbose)
        cache_insert(authors_cits_search, table="author_cits_size")
    auth_cits_incache, _ = author_cits_in_cache(authors[["auth_id", "year"]])
    # Keep if citations are in range
    mask = ((auth_cits_incache.n_cits <= max(_ncits)) &
            (auth_cits_incache.n_cits >= min(_ncits)))
    if self.period:
        mask = ((auth_cits_incache.n_cits >= min(_ncits)) &
                (auth_cits_incache.n_cits <= max(_ncits_full)))
    group = auth_cits_incache[mask]['auth_id'].tolist()

    # Fourth round of filtering: Download publications, verify coauthors
    # (in the FULL period) and first year
    n = len(group)
    text = "Left with {} authors\nFiltering based on coauthor "\
           "count...".format(n)
    custom_print(text, verbose)
    authors = pd.DataFrame({"auth_id": group, "year": self.year},
                           dtype="uint64")
    _, author_year_search = author_year_in_cache(authors)
    matches = []
    if stacked:  # Combine searches
        if not author_year_search.empty:
            q = Template("AU-ID($fill) AND PUBYEAR BEF {}".format(
                self.year + 1))
            auth_year_group = author_year_search.auth_id.tolist()
            params = {"group": auth_year_group, "res": [], "template": q,
                      "refresh": refresh, "joiner": ") OR AU-ID(",
                      "q_type": "docs"}
            if verbose:
                params.update({"total": len(auth_year_group)})
            res, _ = stacked_query(**params)
            res = build_dict(res, auth_year_group)
            if res:
                # res can become empty after build_dict if an auth_id is old
                res = pd.DataFrame.from_dict(res, orient="index")
                res["year"] = self.year
                res = res[["year", "first_year", "n_pubs", "n_coauth"]]
                res.index.name = "auth_id"
                res = res.reset_index()
                cache_insert(res, table="author_year")
        author_year_cache, _ = author_year_in_cache(authors)
        if self._ignore_first_id:
            # Only the number of coauthors needs to be big enough
            enough = author_year_cache.n_coauth >= min(_ncoauth)
            notoomany = author_year_cache.n_coauth <= max(_ncoauth_full)
            mask = enough & notoomany
        elif self.period:
            # The number of coauthors needs to be "big enough" and the
            # first year needs to lie in the window
            same_start = author_year_cache.first_year.between(
                min(_years), max(_years))
            enough = author_year_cache.n_coauth >= min(_ncoauth)
            notoomany = author_year_cache.n_coauth <= max(_ncoauth_full)
            mask = same_start & enough & notoomany
        else:
            # All restrictions apply
            same_start = author_year_cache.first_year.between(
                min(_years), max(_years))
            same_coauths = author_year_cache.n_coauth.between(
                min(_ncoauth), max(_ncoauth))
            mask = same_start & same_coauths
        matches = author_year_cache[mask]["auth_id"].tolist()
    else:  # Query each author individually
        for i, au in enumerate(group):
            print_progress(i + 1, len(group), verbose)
            res = base_query("docs", "AU-ID({})".format(au), refresh=refresh)
            res = [p for p in res if p.coverDate and
                   int(p.coverDate[:4]) <= self.year]
            # Filter
            min_year = int(min([p.coverDate[:4] for p in res]))
            authids = [p.author_ids for p in res if p.author_ids]
            authors = set([a for p in authids for a in p.split(";")])
            n_coauth = len(authors) - 1  # Subtract 1 for focal author
            if self._ignore_first_id and (n_coauth < max(_ncoauth)):
                # Only the number of coauthors needs to be big enough
                continue
            elif (self.period and ((n_coauth < max(_ncoauth)) or
                                   (min_year not in _years))):
                # The number of coauthors needs to be "big enough" and
                # the first year needs to lie in the window
                continue
            elif ((len(res) not in _npapers) or (min_year not in _years) or
                  (n_coauth not in _ncoauth)):
                continue
            matches.append(au)

    if self.period:
        text = "Left with {} authors\nFiltering based on exact period "\
               "citations and coauthors...".format(len(matches))
        custom_print(text, verbose)
        # Further screen matches based on period citations and coauthors
        to_loop = [m for m in matches]  # Temporary copy
        for m in to_loop:
            q = "AU-ID({})".format(m)
            res = base_query("docs", q, refresh=refresh,
                             fields=["eid", "author_ids", "coverDate"])
            pubs = [p for p in res if self.year_period <=
                    int(p.coverDate[:4]) <= self.year]
            coauths = set(get_authors(pubs)) - {str(m)}
            if not min(_ncoauth) <= len(coauths) <= max(_ncoauth):
                matches.remove(m)
                continue
            eids_period = [p.eid for p in pubs]
            cits = count_citations(search_ids=eids_period,
                                   pubyear=self.year + 1,
                                   exclusion_key="AU-ID",
                                   exclusion_ids=[str(m)])
            if not min(_ncits) <= cits <= max(_ncits):
                matches.remove(m)
    text = "Found {:,} author(s) matching all criteria".format(len(matches))
    custom_print(text, verbose)

    # Possibly add information to matches
    if keywords and len(matches) > 0:
        custom_print("Providing additional information...", verbose)
        profiles = [Scientist([str(a)], self.year, period=self.period,
                              refresh=refresh) for a in matches]
        matches = inform_matches(profiles, self, keywords, stop_words,
                                 verbose, refresh, **tfidf_kwds)
    if self.search_affiliations:
        matches = [m for m in matches if len(
            set(m.affiliation_id.replace(" ", "").split(";"))
            .intersection([str(a) for a in self.search_affiliations]))]
    return matches
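
# --- Illustration (not part of the library) --------------------------------
# Hypothetical usage of the method above, following sosia's documented
# workflow; the Scopus ID and year are made up for the example:
#
#     from sosia import Original
#
#     scientist = Original(55208373700, 2017)
#     scientist.define_search_sources()
#     scientist.define_search_group(stacked=True, verbose=True)
#     matches = scientist.find_matches(stacked=True, verbose=True,
#                                      information=["first_year",
#                                                   "num_publications"])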
def inform_matches(self, keywords, stop_words, verbose, refresh, **kwds):
    """Add match-specific information to all matches.

    Parameters
    ----------
    self : sosia.Original()
        Object whose matches should receive additional information.
    keywords : iterable of strings
        Which information to add to matches.
    stop_words : list
        A list of words that should be filtered in the analysis of
        abstracts.
    verbose : bool
        Whether to report on the progress of the process and the
        completeness of document information.
    refresh : bool
        Whether to refresh all cached files or not.
    kwds : keywords
        Parameters to pass to
        sklearn.feature_extraction.text.TfidfVectorizer for abstract
        and reference vectorization.

    Returns
    -------
    out : list of namedtuples
        A list of namedtuples representing matches.  Provided
        information depends on the provided keywords.
    """
    from string import digits, punctuation

    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

    from sosia.classes import Scientist

    # Create Match object
    fields = "ID name " + " ".join(keywords)
    m = namedtuple("Match", fields)
    # Preparation
    doc_parse = "reference_sim" in keywords or "abstract_sim" in keywords
    if doc_parse:
        focal_docs = parse_docs([d.eid for d in self.publications], refresh)
        focal_refs, focal_refs_n, focal_abs, focal_abs_n = focal_docs
        if not stop_words:
            stop_words = list(ENGLISH_STOP_WORDS) + list(punctuation + digits)
    # Add selected information match-by-match
    out = []
    completeness = {}
    total = len(self.matches)
    print_progress(0, total, verbose)
    meta = namedtuple("Meta", "refs absts total")
    for idx, auth_id in enumerate(self.matches):
        period = self.year + 1 - self._period_year
        p = Scientist([auth_id], self.year, period=period, refresh=refresh,
                      sql_fname=self.sql_fname)
        match_info = inform_match(p, keywords)
        # Abstract and reference similarity are computed jointly
        if doc_parse:
            eids = [d.eid for d in p.publications]
            refs, refs_n, absts, absts_n = parse_docs(eids, refresh)
            completeness[auth_id] = meta(refs=refs_n, absts=absts_n,
                                         total=len(eids))
            if "reference_sim" in keywords:
                ref_cos = compute_similarity(refs, focal_refs, **kwds)
                match_info["reference_sim"] = ref_cos
            if "abstract_sim" in keywords:
                kwds.update({"stop_words": stop_words})
                abs_cos = compute_similarity(absts, focal_abs,
                                             tokenize=True, **kwds)
                match_info["abstract_sim"] = abs_cos
        out.append(m(**match_info))
        print_progress(idx + 1, total, verbose)
    # Eventually print information on missing document information
    if verbose and doc_parse:
        for auth_id, stats in completeness.items():
            _print_missing_docs([auth_id], stats.absts, stats.refs,
                                stats.total)
        focal_pubs_n = len(self.publications)
        _print_missing_docs(self.identifier, focal_abs_n, focal_refs_n,
                            focal_pubs_n, res_type="Original")
    return out
def inform_matches(profiles, focal, keywords, stop_words, verbose,
                   refresh, **kwds):
    """Create namedtuples adding information to matches.

    Parameters
    ----------
    profiles : list of Scientist()
        A list of Scientist objects representing matches.
    focal : Scientist
        Object of class Scientist representing the focal scientist.
    keywords : iterable of strings
        Which information to add to matches.
    stop_words : list
        A list of words that should be filtered in the analysis of
        abstracts.
    verbose : bool
        Whether to report on the progress of the process and the
        completeness of document information.
    refresh : bool
        Whether to refresh all cached files or not.
    kwds : keywords
        Parameters to pass to
        sklearn.feature_extraction.text.TfidfVectorizer for abstract
        and reference vectorization.

    Returns
    -------
    m : list of namedtuples
        A list of namedtuples representing matches.  Provided
        information depends on the provided keywords.
    """
    from sosia.classes import Scientist

    # Create Match object
    fields = "ID name " + " ".join(keywords)
    m = namedtuple("Match", fields)
    # Preparation
    doc_parse = "reference_sim" in keywords or "abstract_sim" in keywords
    total = len(profiles)
    print_progress(0, total, verbose)
    if doc_parse:
        focal_eids = [d.eid for d in focal.publications]
        focal_docs = parse_docs(focal_eids, refresh)
        focal_refs, focal_refs_n, focal_abs, focal_abs_n = focal_docs
    # Add selected information
    out = []
    info = {}  # Collects information on missing document information
    meta = namedtuple("Meta", "refs absts total")
    for idx, p in enumerate(profiles):
        # Add characteristics
        match_info = {"ID": p.identifier[0], "name": p.name}
        if "language" in keywords:
            try:
                match_info["language"] = p.get_publication_languages().language
            except Scopus404Error:  # Refresh profile
                p = Scientist(p.identifier, p.year, refresh=True)
                match_info["language"] = p.get_publication_languages().language
        if "first_name" in keywords:
            match_info["first_name"] = p.first_name
        if "surname" in keywords:
            match_info["surname"] = p.surname
        if "first_year" in keywords:
            match_info["first_year"] = p.first_year
        if "num_coauthors" in keywords:
            match_info["num_coauthors"] = len(p.coauthors)
        if "num_publications" in keywords:
            match_info["num_publications"] = len(p.publications)
        if "num_citations" in keywords:
            match_info["num_citations"] = p.citations
        if "num_coauthors_period" in keywords:
            match_info["num_coauthors_period"] = len(p.coauthors_period)
        if "num_publications_period" in keywords:
            match_info["num_publications_period"] = len(p.publications_period)
        if "num_citations_period" in keywords:
            match_info["num_citations_period"] = p.citations_period
        if "subjects" in keywords:
            match_info["subjects"] = p.subjects
        if "country" in keywords:
            match_info["country"] = p.country
        if "city" in keywords:
            match_info["city"] = p.city
        if "affiliation_id" in keywords:
            match_info["affiliation_id"] = p.affiliation_id
        if "affiliation" in keywords:
            match_info["affiliation"] = p.organization
        # Abstract and reference similarity are computed jointly
        if doc_parse:
            eids = [d.eid for d in p.publications]
            refs, refs_n, absts, absts_n = parse_docs(eids, refresh)
            vec = TfidfVectorizer(**kwds)
            ref_cos = compute_cos(vec.fit_transform([refs, focal_refs]))
            vec = TfidfVectorizer(stop_words=stop_words,
                                  tokenizer=tokenize_and_stem, **kwds)
            abs_cos = compute_cos(vec.fit_transform([absts, focal_abs]))
            # Save the stats for the print statement below
            key = "; ".join(p.identifier)
            info[key] = meta(refs=refs_n, absts=absts_n, total=len(eids))
            if "reference_sim" in keywords:
                match_info["reference_sim"] = ref_cos
            if "abstract_sim" in keywords:
                match_info["abstract_sim"] = abs_cos
        # Finalize
        out.append(m(**match_info))
        print_progress(idx + 1, total, verbose)
    # Print information on missing document information
    if verbose and doc_parse:
        for auth_id, stats in info.items():
            _print_missing_docs(auth_id, stats.refs, stats.absts, stats.total)
        label = ";".join(focal.identifier) + " (focal)"
        focal_pubs_n = len(focal.publications)
        _print_missing_docs(label, focal_refs_n, focal_abs_n, focal_pubs_n)
    return out
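
# --- Illustration (not part of the library) --------------------------------
# Runnable standalone example of the vectorize-then-cosine step above,
# with two toy "documents" standing in for the concatenated references
# or abstracts, and sklearn's cosine_similarity in place of compute_cos:
def _demo_tfidf_cosine():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    docs = ["novel method for author matching",
            "a matching method for author disambiguation"]
    matrix = TfidfVectorizer().fit_transform(docs)
    print(round(cosine_similarity(matrix[0], matrix[1])[0, 0], 2))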
def filter_pub_counts(group, conn, ybefore, yupto, npapers, yfrom=None,
                      verbose=False):
    """Filter authors based on restrictions in the number of
    publications in different periods, searched by query_size.

    Parameters
    ----------
    group : list of str
        Scopus IDs of authors to be filtered.
    conn : sqlite3 connection
        Standing connection to a SQLite3 database.
    ybefore : int
        Year to be used as first year.  Publications in this year and
        before need to be 0.
    yupto : int
        Year up to which to count publications.
    npapers : list
        List of count of publications, minimum and maximum.
    yfrom : int (optional, default=None)
        If provided, publications are counted only after this year.
        Publications are still required to be 0 before ybefore.
    verbose : bool (optional, default=False)
        Whether to print information on the search progress.

    Returns
    -------
    group : list of str
        Scopus IDs filtered.
    pubs_counts : list of int
        List of counts of publications within the period provided for
        the authors in group.
    older_authors : list of str
        Scopus IDs filtered out because they have publications before
        ybefore.
    """
    from itertools import product

    from pandas import DataFrame

    group = [int(x) for x in group]
    years_check = [ybefore, yupto]
    if yfrom:
        years_check.extend([yfrom - 1])
    authors = DataFrame(product(group, years_check), dtype="uint64",
                        columns=["auth_id", "year"])
    auth_npubs, _ = retrieve_author_info(authors, conn, "author_pubs")
    au_skip = []
    group_tocheck = set(group)
    older_authors = []
    pubs_counts = []
    # Use information in database
    if not auth_npubs.empty:
        # Remove authors based on age
        mask = (auth_npubs["year"] <= ybefore) & (auth_npubs["n_pubs"] > 0)
        au_remove = set(auth_npubs[mask]["auth_id"].unique())
        older_authors.extend(au_remove)
        # Remove if the publication count in a year is in any case too small
        mask = ((auth_npubs["year"] >= yupto) &
                (auth_npubs["n_pubs"] < min(npapers)))
        au_remove.update(auth_npubs[mask]["auth_id"])
        # Authors with no publications before the minimum year
        mask = (auth_npubs["year"] == ybefore) & (auth_npubs["n_pubs"] == 0)
        au_ok_miny = set(auth_npubs[mask]["auth_id"].unique())
        # Check publications in range
        if yfrom:
            # Keep authors where subtracting the publication count from
            # before the period from the total count is possible
            mask = auth_npubs["year"] == yfrom - 1
            rename = {"n_pubs": "n_pubs_bef"}
            auth_npubs_bef = auth_npubs[mask].copy().rename(columns=rename)
            auth_npubs_bef["year"] = yupto
            auth_npubs = (auth_npubs.merge(auth_npubs_bef, "inner",
                                           on=["auth_id", "year"])
                                    .fillna(0))
            auth_npubs["n_pubs"] -= auth_npubs["n_pubs_bef"]
        # Remove authors because of their publication count
        mask = (((auth_npubs["year"] >= yupto) &
                 (auth_npubs["n_pubs"] < min(npapers))) |
                ((auth_npubs["year"] <= yupto) &
                 (auth_npubs["n_pubs"] > max(npapers))))
        au_remove.update(auth_npubs[mask]["auth_id"])
        # Authors with a publication count within range before the given year
        mask = (((auth_npubs["year"] == yupto) &
                 (auth_npubs["n_pubs"] >= min(npapers))) &
                (auth_npubs["n_pubs"] <= max(npapers)))
        au_ok_year = auth_npubs[mask][["auth_id", "n_pubs"]].drop_duplicates()
        # Keep authors that match both conditions
        au_ok = au_ok_miny.intersection(au_ok_year["auth_id"].unique())
        mask = au_ok_year["auth_id"].isin(au_ok)
        pubs_counts = au_ok_year[mask]["n_pubs"].tolist()
        # Authors that match only the first condition, with the second
        # unknown, can skip the check before the minimum year
        au_skip = set(x for x in au_ok_miny if x not in au_remove | au_ok)
        group = [x for x in group if x not in au_remove]
        group_tocheck = set(x for x in group if x not in au_skip | au_ok)
    # Verify that publications before the minimum year are 0
    if group_tocheck:
        n = len(group_tocheck)
        text = f"Obtaining information for {n:,} authors without "\
               "sufficient information in database..."
        custom_print(text, verbose)
        print_progress(0, n, verbose)
        to_loop = [x for x in group_tocheck]  # Temporary copy
        for i, auth_id in enumerate(to_loop):
            npubs_ybefore = auth_npubs_retrieve_insert(auth_id, ybefore, conn)
            if npubs_ybefore:
                group.remove(auth_id)
                group_tocheck.remove(auth_id)
                older_authors.append(auth_id)
            print_progress(i + 1, n, verbose)
        text = f"Left with {len(group):,} authors based on publication "\
               f"information before {ybefore}"
        custom_print(text, verbose)
    # Verify that the publication count before the given year falls in range
    group_tocheck.update(au_skip)
    if group_tocheck:
        n = len(group_tocheck)
        text = f"Counting publications of {n:,} authors before {yupto+1}..."
        custom_print(text, verbose)
        print_progress(0, n, verbose)
        for i, au in enumerate(group_tocheck):
            n_pubs_yupto = auth_npubs_retrieve_insert(au, yupto, conn)
            # Eventually decrease the publication count
            if yfrom and n_pubs_yupto >= min(npapers):
                n_pubs_yfrom = auth_npubs_retrieve_insert(au, yfrom - 1, conn)
                n_pubs_yupto -= n_pubs_yfrom
            if n_pubs_yupto < min(npapers) or n_pubs_yupto > max(npapers):
                group.remove(au)
            else:
                pubs_counts.append(n_pubs_yupto)
            print_progress(i + 1, n, verbose)
    return group, pubs_counts, older_authors
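
# --- Illustration (not part of the library) --------------------------------
# Standalone illustration of the period adjustment above: the cumulative
# count up to `yupto` minus the cumulative count before `yfrom` yields
# the in-period publication count (toy data).
def _demo_period_adjustment():
    import pandas as pd

    upto = pd.DataFrame({"auth_id": [1, 2], "year": 2010, "n_pubs": [8, 5]})
    bef = pd.DataFrame({"auth_id": [1, 2], "year": 2010,
                        "n_pubs_bef": [3, 1]})
    adj = upto.merge(bef, "inner", on=["auth_id", "year"]).fillna(0)
    adj["n_pubs"] -= adj["n_pubs_bef"]
    print(adj[["auth_id", "n_pubs"]])  # 5 and 4 pubs within the period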