def get_country(aff_id, refresh=350):
    """Get country of an affiliation."""
    # Serve from the module-level cache when we already resolved this ID.
    if aff_id in _aff_countries:
        return _aff_countries[aff_id]
    # Cache miss: query Scopus, falling back to "Unknown" on any API error.
    try:
        profile = ContentAffiliationRetrieval(aff_id, refresh=refresh)
        resolved = profile.country or "Unknown"
    except ScopusException:
        resolved = "Unknown"
    # Track org-profile IDs (prefix "6") whose country could not be found.
    if resolved == "Unknown" and aff_id.startswith("6"):
        _aff_missing_countries.add(aff_id)
    # Harmonize the country name and memoize the result for next calls.
    resolved = _country_map.get(resolved, resolved)
    _aff_countries[aff_id] = resolved
    return resolved
def get_type(aff_ids, refresh=350):
    """Return types of affiliations recorded by Scopus."""
    types = []
    for afid in aff_ids:
        # Use parsed information or load new information
        if afid in _aff_types:
            types.append(_aff_types[afid])
            continue
        if afid.startswith("1"):
            # Non-org profile IDs (prefix "1") carry no usable type.
            resolved = "?"
        else:
            try:
                profile = ContentAffiliationRetrieval(afid, refresh=refresh)
                resolved = profile.org_type.split("|")[0]
                resolved = _aff_map.get(resolved, resolved)
            except (AttributeError, ScopusException):
                # org_type missing or the retrieval itself failed.
                resolved = "?"
        _aff_types[afid] = resolved
        types.append(resolved)
    return tuple(sorted(types, reverse=True))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Tests for `scopus.ContentAffiliationRetrieval` module."""
from nose.tools import assert_equal, assert_true

from pybliometrics.scopus import ContentAffiliationRetrieval

# Shared fixture, retrieved once at import time (hits the Scopus API).
aff = ContentAffiliationRetrieval('60000356', refresh=True)


def test_address():
    assert_equal(aff.address, 'Private Bag X3')


def test_affiliation_name():
    assert_equal(aff.affiliation_name, 'University of Cape Town')


def test_author_count():
    # Compare numerically: the attribute is a digit string, and a
    # lexicographic ">=" gives wrong results (e.g. '9999' >= '10951'
    # is True while 9999 < 10951).
    expected = 10951
    assert_true(int(aff.author_count) >= expected)


def test_city():
    assert_equal(aff.city, 'Cape Town')


def test_country():
    assert_equal(aff.country, 'South Africa')
def search_scopus(query, docs=None, retrieve_orcid=True):
    """Search Scopus.

    Runs `query` against the Scopus Search API, retrieves the full
    abstract record for every hit and converts each one into a
    `Document`.

    Parameters
    ----------
    query : str
        Scopus advanced-search query string.
    docs : DocumentSet, optional
        Existing set to union the new results with.
    retrieve_orcid : bool
        Whether to look up each author's ORCID (one extra API call per
        distinct author).

    Returns
    -------
    DocumentSet or None
        None when the query is invalid, yields no hits, or a paper
        cannot be retrieved.
    """
    documents = []
    authors_cache = {}
    affiliations_cache = {}
    try:
        retrieved_paper_ids = ScopusSearch(query, view="STANDARD").get_eids()
    except ScopusQueryError:
        print("Impossible to process query \"{}\".".format(query))
        return None
    if len(retrieved_paper_ids) == 0:
        print("No matching documents for the provided query.")
        return None
    for paper_id in tqdm(retrieved_paper_ids):
        try:
            paper = AbstractRetrieval(paper_id, view="FULL")
        except ValueError:
            print("Impossible to retrieve data for paper \"{}\".".format(paper_id))
            return None
        doc_id = DocumentID()
        doc_id.parse_scopus(paper)
        authors = []
        if paper.authors:
            for author in paper.authors:
                # Populated below while scanning the author's affiliations;
                # the Author object holds a reference to this list.
                author_affiliations = []
                if retrieve_orcid:
                    # Look up the ORCID once per distinct author ID.
                    if author.auid not in authors_cache:
                        authors_cache[author.auid] = \
                            AuthorRetrieval(author.auid).orcid
                    orcid = authors_cache[author.auid]
                else:
                    orcid = None
                authors.append(Author(name=author.indexed_name,
                                      orcid=orcid,
                                      affiliations=author_affiliations))
                if author.affiliation:
                    for affiliation_id in author.affiliation:
                        if affiliation_id in affiliations_cache:
                            affiliation = affiliations_cache[affiliation_id]
                        else:
                            # Best-effort lookup: skip the affiliation on any
                            # retrieval error (was a bare `except:`, which also
                            # swallowed KeyboardInterrupt/SystemExit).
                            try:
                                affiliation = \
                                    ContentAffiliationRetrieval(affiliation_id)
                                affiliations_cache[affiliation_id] = affiliation
                            except Exception:
                                affiliation = None
                        if affiliation:
                            author_affiliations.append(
                                Affiliation(name=affiliation.affiliation_name,
                                            city=affiliation.city,
                                            country=affiliation.country))
        references = []
        if paper.refcount and int(paper.refcount) > 0 and paper.references:
            for reference in paper.references:
                if reference.title:
                    references.append(reference.title)
        if paper.language:
            try:
                # Scopus stores an ISO 639-2/B code; map it to a name.
                language = iso639.languages.get(part2b=paper.language).name
            except KeyError:
                language = None
        else:
            language = None
        document = Document(id=doc_id,
                            title=paper.title,
                            keywords=paper.authkeywords,
                            abstract=paper.description,
                            source=paper.publicationName,
                            source_type=paper.aggregationType,
                            language=language,
                            year=int(paper.coverDate.split("-")[0]),
                            authors=authors,
                            references=references,
                            publisher=paper.publisher,
                            internal=paper)
        if paper.citedby_count:
            document.citation_count = int(paper.citedby_count)
        documents.append(document)
    if docs:
        return DocumentSet(docs=documents).union(docs)
    else:
        return DocumentSet(docs=documents)
name = '{}, {}'.format(CoAuthor.surname, CoAuthor.given_name) print('') print(f'name: {name}') # get an affiliation with an OrgID (starts with a 6) affil_id = CoAuthor.affiliation_current if affil_id[ 0] != '6' and CoAuthor.affiliation_history is not None: affil_id_list = [ a for a in CoAuthor.affiliation_history if a[0] == '6' ] if len(affil_id_list) > 0: affil_id = affil_id_list[0] affiliation = ContentAffiliationRetrieval(affil_id) affil_name = affiliation.affiliation_name print(f'affiliation: {affil_name}') # google them google_results = search(' '.join([ 'email', CoAuthor.given_name, CoAuthor.surname, affiliation.org_domain ]), num=num_url_search_email, only_standard=True, pause=10.) # scrape the results, looking for their email address email_list = [] for count, url in enumerate(google_results):
def main():
    """Count affiliation occurrences per year, rank them, and plot shares."""
    # Count affiliations
    indiv_counts = {}
    pair_counts = {}
    totals = pd.Series(dtype="uint64", name="n_obs")
    print(">>> Counting affiliations from source files year-wise...")
    years = range(START, END + 1)
    print_progress(0, len(years))
    for i, year in enumerate(years):
        # Read files by year
        files = glob(f"{SOURCE_FOLDER}*{year}*.csv")
        df = pd.concat([read_ma_source_file(f) for f in files])
        dup_cols = ["eid", "author"]
        df = df.drop_duplicates(subset=dup_cols).drop(dup_cols, axis=1)
        totals.loc[year] = df.shape[0]
        # Individual occurrences plus unordered co-occurrence pairs
        indiv_counts[year] = Counter(
            [a for sl in df["affiliations"] for a in sl])
        pairs = [combinations(sl, 2) for sl in df["affiliations"]]
        pair_counts[year] = Counter(
            [tuple(sorted(p)) for sl in pairs for p in sl])
        print_progress(i + 1, len(years))
    del df
    # Write yearly rankings
    print(">>> Writing yearly rankings...")
    tops_indiv = select_and_write(indiv_counts)
    tops_pairs = select_and_write(pair_counts)
    for aff1, aff2 in tops_pairs:
        name1 = ContentAffiliationRetrieval(aff1).affiliation_name
        name2 = ContentAffiliationRetrieval(aff2).affiliation_name
        print(name1, "--", name2)
    # Collect data for plotting
    print(f">>> Plotting {len(tops_indiv)} affiliations")
    # Build yearly frames and concatenate once: DataFrame.append was
    # deprecated and removed in pandas 2.0.
    frames = []
    all_afids = set()
    for year, data in indiv_counts.items():
        new = pd.DataFrame.from_dict(data, orient="index")
        all_afids.update(new.index)
        new["year"] = year
        frames.append(new.reindex(tops_indiv))
    df = pd.concat(frames)
    info = {aff_id: ContentAffiliationRetrieval(aff_id).affiliation_name
            for aff_id in tops_indiv}
    df["affiliation"] = pd.Series(info)
    df = (df.rename(columns={0: "occurrence"})
            .merge(totals, left_on="year", right_index=True))
    df["occurrence_norm"] = df["occurrence"] / df["n_obs"] * 100
    # Make plot
    fig, ax = plt.subplots(figsize=(9, 9))
    sns.lineplot(x="year", y="occurrence_norm", hue="affiliation", data=df,
                 ax=ax, style=None, palette="colorblind")
    # Drop the legend's hue title entry
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles=handles[1:], labels=labels[1:])
    ylabel = "Share of affiliation's occurrence in multiple "\
             "affiliations author-article observations"
    ax.set(ylabel=ylabel)
    ax.set_ylim(bottom=0)
    format_time_axis(ax, df["year"].min(), df["year"].max())
    fname = OUTPUT_FOLDER + "Figures/top-affs.pdf"
    fig.savefig(fname, bbox_inches="tight")
    plt.close(fig)
    # Count affiliations by type: non-org profile IDs start with "1"
    nonorg_afids = {a for a in all_afids if a.startswith("1")}
    n_nonorg = len(nonorg_afids)
    print(f">>> {len(all_afids) - n_nonorg:,} org affiliation IDs")
    # Fixed label: this count is the NON-org IDs (was printed as "org")
    print(f">>> {n_nonorg:,} non-org affiliation IDs")
    # Randomly analyze some nonorg affiliation IDs
    print(">>> Random non-org affiliation names")
    # Cap the sample size: random.sample raises ValueError when the
    # requested size exceeds the population.
    for aff_id in sample(tuple(nonorg_afids), min(100, len(nonorg_afids))):
        try:
            aff = ContentAffiliationRetrieval(aff_id, refresh=20)
            print(aff.affiliation_name)
        except Scopus404Error:
            print("doesn't exist")