def get_country(aff_id, refresh=350):
    """Get country of an affiliation."""
    try:
        # Use cached information if available
        country = _aff_countries[aff_id]
    except KeyError:
        # Retrieve new information and cache it
        try:
            aff = ContentAffiliationRetrieval(aff_id, refresh=refresh)
            country = aff.country or "Unknown"
        except ScopusException:
            country = "Unknown"
        # Record org profiles (IDs starting with "6") that lack a country
        if aff_id.startswith("6") and country == "Unknown":
            _aff_missing_countries.add(aff_id)
        country = _country_map.get(country, country)
        _aff_countries[aff_id] = country
    return country


def get_type(aff_ids, refresh=350):
    """Return types of affiliations recorded by Scopus."""
    out = []
    for aff_id in aff_ids:
        # Use parsed information or load new information
        try:
            aff_type = _aff_types[aff_id]
        except KeyError:
            # Non-org profiles (IDs starting with "1") carry no type
            if aff_id.startswith("1"):
                aff_type = "?"
            else:
                try:
                    aff = ContentAffiliationRetrieval(aff_id, refresh=refresh)
                    aff_type = aff.org_type.split("|")[0]
                    aff_type = _aff_map.get(aff_type, aff_type)
                except (AttributeError, ScopusException):
                    aff_type = "?"
            _aff_types[aff_id] = aff_type
        out.append(aff_type)
    return tuple(sorted(out, reverse=True))
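
# --- Usage sketch (not part of the original module) ----------------------
# The two helpers above rely on module-level caches and mapping tables that
# are defined elsewhere in their source file. A minimal, assumed setup:
from pybliometrics.scopus import ContentAffiliationRetrieval
from pybliometrics.scopus.exception import ScopusException

_aff_countries = {}             # aff_id -> country name (cache)
_aff_types = {}                 # aff_id -> organization type (cache)
_aff_missing_countries = set()  # org profiles (IDs starting with "6") lacking a country
_country_map = {}               # optional country-name normalization
_aff_map = {}                   # optional org-type normalization

# Example calls; the affiliation ID is the University of Cape Town profile
# exercised in the tests below:
print(get_country("60000356"))  # -> "South Africa"
print(get_type(["60000356"]))   # -> a 1-tuple with the Scopus org type
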
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Tests for `scopus.ContentAffiliationRetrieval` module."""

from nose.tools import assert_equal, assert_true

from pybliometrics.scopus import ContentAffiliationRetrieval

aff = ContentAffiliationRetrieval('60000356', refresh=True)


def test_address():
    assert_equal(aff.address, 'Private Bag X3')


def test_affiliation_name():
    assert_equal(aff.affiliation_name, 'University of Cape Town')


def test_author_count():
    # Compare as integers; string comparison would be lexicographic
    expected = 10951
    assert_true(int(aff.author_count) >= expected)


def test_city():
    assert_equal(aff.city, 'Cape Town')


def test_country():
    assert_equal(aff.country, 'South Africa')
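
# --- Hypothetical pytest port (not part of the original suite) -----------
# nose is unmaintained on recent Pythons; the same checks could be written
# with plain asserts and a module-scoped fixture, e.g.:
import pytest

from pybliometrics.scopus import ContentAffiliationRetrieval


@pytest.fixture(scope="module")
def aff():
    return ContentAffiliationRetrieval('60000356', refresh=True)


def test_country_pytest(aff):
    assert aff.country == 'South Africa'


def test_author_count_pytest(aff):
    assert int(aff.author_count) >= 10951
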
def search_scopus(query, docs=None, retrieve_orcid=True):
    """Search Scopus for `query` and return a DocumentSet, or None on failure."""

    documents = []
    authors_cache = {}
    affiliations_cache = {}
    try:
        retrieved_paper_ids = ScopusSearch(query, view="STANDARD").get_eids()
    except ScopusQueryError:
        print("Impossible to process query \"{}\".".format(query))
        return None
    if len(retrieved_paper_ids) == 0:
        print("No matching documents for the provided query.")
        return None
    for paper_id in tqdm(retrieved_paper_ids):
        try:
            paper = AbstractRetrieval(paper_id, view="FULL")
        except ValueError:
            print("Impossible to retrieve data for paper \"{}\".".format(paper_id))
            continue  # skip this paper rather than discarding all results
        doc_id = DocumentID()
        doc_id.parse_scopus(paper)
        authors = []
        if paper.authors:
            for author in paper.authors:
                author_affiliations = []
                if retrieve_orcid:
                    # Cache ORCID lookups so each author is retrieved only once
                    if author.auid not in authors_cache:
                        authors_cache[author.auid] = AuthorRetrieval(author.auid).orcid
                    orcid = authors_cache[author.auid]
                else:
                    orcid = None
                authors.append(Author(name=author.indexed_name,
                                      orcid=orcid,
                                      affiliations=author_affiliations))
                if author.affiliation:
                    for affiliation_id in author.affiliation:
                        if affiliation_id in affiliations_cache:
                            affiliation = affiliations_cache[affiliation_id]
                        else:
                            try:
                                affiliation = ContentAffiliationRetrieval(affiliation_id)
                                affiliations_cache[affiliation_id] = affiliation
                            except Exception:  # any retrieval failure, e.g. a 404
                                affiliation = None
                        if affiliation:
                            author_affiliations.append(Affiliation(name=affiliation.affiliation_name,
                                                                   city=affiliation.city,
                                                                   country=affiliation.country))
        references = []
        if paper.refcount and int(paper.refcount) > 0 and paper.references:
            for reference in paper.references:
                if reference.title:
                    references.append(reference.title)
        if paper.language:
            try:
                language = iso639.languages.get(part2b=paper.language).name
            except KeyError:
                language = None
        else:
            language = None

        document = Document(id=doc_id,
                            title=paper.title,
                            keywords=paper.authkeywords,
                            abstract=paper.description,
                            source=paper.publicationName,
                            source_type=paper.aggregationType,
                            language=language,
                            year=int(paper.coverDate.split("-")[0]),
                            authors=authors,
                            references=references,
                            publisher=paper.publisher,
                            internal=paper)
        if paper.citedby_count:
            document.citation_count = int(paper.citedby_count)
        documents.append(document)
    if docs:
        return DocumentSet(docs=documents).union(docs)
    else:
        return DocumentSet(docs=documents)
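
# --- Usage sketch (hypothetical; query strings are examples) --------------
# Assumes a configured pybliometrics API key and the Document/DocumentSet
# types from the surrounding module.
docs = search_scopus('TITLE-ABS-KEY("systematic literature review")')
if docs is not None:
    # A second query can extend the existing set via DocumentSet.union()
    docs = search_scopus('TITLE-ABS-KEY("snowballing")', docs=docs)
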
                    name = '{}, {}'.format(CoAuthor.surname,
                                           CoAuthor.given_name)
                    print('')
                    print(f'name:        {name}')

                    # get an affiliation with an OrgID (starts with a 6)
                    affil_id = CoAuthor.affiliation_current
                    if affil_id[0] != '6' and CoAuthor.affiliation_history is not None:
                        affil_id_list = [a for a in CoAuthor.affiliation_history
                                         if a[0] == '6']
                        if len(affil_id_list) > 0:
                            affil_id = affil_id_list[0]
                    affiliation = ContentAffiliationRetrieval(affil_id)
                    affil_name = affiliation.affiliation_name
                    print(f'affiliation: {affil_name}')

                    # google them
                    google_results = search(' '.join([
                        'email', CoAuthor.given_name, CoAuthor.surname,
                        affiliation.org_domain
                    ]),
                                            num=num_url_search_email,
                                            only_standard=True,
                                            pause=10.)

                    # scrape the results, looking for their email address
                    email_list = []
                    for count, url in enumerate(google_results):
def main():
    # Count affiliations
    indiv_counts = {}
    pair_counts = {}
    totals = pd.Series(dtype="uint64", name="n_obs")
    print(">>> Counting affiliations from source files year-wise...")
    years = range(START, END + 1)
    print_progress(0, len(years))
    for i, year in enumerate(years):
        # Read files by year
        files = glob(f"{SOURCE_FOLDER}*{year}*.csv")
        df = pd.concat([read_ma_source_file(f) for f in files])
        dup_cols = ["eid", "author"]
        df = df.drop_duplicates(subset=dup_cols).drop(dup_cols, axis=1)
        totals.loc[year] = df.shape[0]
        indiv_counts[year] = Counter(
            [a for sl in df["affiliations"] for a in sl])
        pairs = [combinations(sl, 2) for sl in df["affiliations"]]
        pair_counts[year] = Counter(
            [tuple(sorted(p)) for sl in pairs for p in sl])
        print_progress(i + 1, len(years))
        del df

    # Write yearly rankings
    print(">>> Writing yearly rankings...")
    tops_indiv = select_and_write(indiv_counts)
    tops_pairs = select_and_write(pair_counts)
    for aff1, aff2 in tops_pairs:
        name1 = ContentAffiliationRetrieval(aff1).affiliation_name
        name2 = ContentAffiliationRetrieval(aff2).affiliation_name
        print(name1, "--", name2)

    # Collect data for plotting
    print(f">>> Plotting {len(tops_indiv)} affiliations")
    df = pd.DataFrame()
    all_afids = set()
    for year, data in indiv_counts.items():
        new = pd.DataFrame.from_dict(data, orient="index")
        all_afids.update(new.index)
        new["year"] = year
        # DataFrame.append() was removed in pandas 2.0; use pd.concat instead
        df = pd.concat([df, new.reindex(tops_indiv)])
    info = {
        aff_id: ContentAffiliationRetrieval(aff_id).affiliation_name
        for aff_id in tops_indiv
    }
    df["affiliation"] = pd.Series(info)
    df = (df.rename(columns={
        0: "occurrence"
    }).merge(totals, left_on="year", right_index=True))
    df["occurrence_norm"] = df["occurrence"] / df["n_obs"] * 100

    # Make plot
    fig, ax = plt.subplots(figsize=(9, 9))
    sns.lineplot(x="year",
                 y="occurrence_norm",
                 hue="affiliation",
                 data=df,
                 ax=ax,
                 style=None,
                 palette="colorblind")
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles=handles[1:], labels=labels[1:])
    ylabel = "Share of affiliation's occurrence in multiple "\
             "affiliations author-article observations"
    ax.set(ylabel=ylabel)
    ax.set_ylim(bottom=0)
    format_time_axis(ax, df["year"].min(), df["year"].max())
    fname = OUTPUT_FOLDER + "Figures/top-affs.pdf"
    fig.savefig(fname, bbox_inches="tight")
    plt.close(fig)

    # Count affiliations by type
    nonorg_afids = {a for a in all_afids if a.startswith("1")}
    n_nonorg = len(nonorg_afids)
    print(f">>> {len(all_afids) - n_nonorg:,} org affiliation IDs")
    print(f">>> {n_nonorg:,} org affiliation IDs")

    # Randomly analyze some nonorg affiliation IDs
    print(">>> Random non-org affiliation names")
    for aff_id in sample(tuple(nonorg_afids), 100):
        try:
            aff = ContentAffiliationRetrieval(aff_id, refresh=20)
            print(aff.affiliation_name)
        except Scopus404Error:
            print("doesn't exist")