def main():
    # Read all researchers
    cols = ["scopus_id", "label"]
    df = pd.read_csv(PERSON_FILE, usecols=cols)[cols]
    df['scopus_id'] = df['scopus_id'].fillna(df['label'])
    df = df.drop_duplicates(subset="scopus_id").set_index("scopus_id")
    df.index = df.index.map(lambda x: str(int(x))
                            if isinstance(x, float) else x)

    # Skip persons that already have estimates
    try:
        collected = pd.read_csv(TARGET_FILE,
                                index_col=0,
                                na_values="",
                                keep_default_na=False)
        collected = collected[collected.index.isin(df.index.tolist())]
        collected = collected.dropna(subset=["gender"])
        df = df.drop(collected.index, errors='ignore')
    except FileNotFoundError:
        collected = pd.DataFrame()

    # Prepare names
    df["first"] = df.apply(get_firstname, axis=1)
    before = df.shape[0]
    df = df.dropna(subset=["first"])
    name_invalid = before - df.shape[0]
    if name_invalid:
        print(f">>> Dropping {name_invalid:,} researchers w/o valid name")

    # Get gender estimates
    estimates = {}
    total = df['first'].nunique()
    print(f">>> Searching for {total} new names...")
    for name in tqdm(df["first"].unique()):
        try:
            resp = genderize.Genderize().get([name])
            estimates[name] = resp[0]
        except Exception:  # most likely the daily quota is exceeded
            print("... Quota exceeded, try again tomorrow")
            break

    # Write out
    if estimates:
        new = pd.DataFrame(estimates).T
        new["count"] = new["count"].astype(float)
        df = df.join(new, how="right", on="first")
        df = df[["count", "gender", "name", "probability"]]
        collected = pd.concat([collected, df]).sort_index()
        nans = collected["gender"] == ""
        collected.loc[nans, ["count", "gender", "probability"]] = nan
        collected.to_csv(TARGET_FILE, index_label="ID")

    # Statistics
    print(">>> Distribution of gender:")
    print(collected["gender"].value_counts())
    n_missing = collected["gender"].isna().sum() + name_invalid
    write_stats({"N_of_researcher_nogender": n_missing})
    total_researchers = collected.shape[0] + name_invalid
    share = n_missing / total_researchers
    print(f">>> No estimates for {n_missing:,} out of {total_researchers:,} "
          f"({share:,.2%}) researchers")
def main():
    auth = pd.DataFrame(columns=['index', 'centrality'])
    com = pd.DataFrame(columns=['index', 'centrality'])
    global_auth = pd.DataFrame()
    global_com = pd.DataFrame()
    print(">>> Now working on:")
    for file in sorted(glob(NETWORK_FOLDER + "*.gexf")):
        # Read in
        n_id = basename(splitext(file)[0])
        year = n_id[:4]
        print("...", n_id)
        H = nx.read_gexf(file)
        G = giant(H)

        # Clustering of random network
        avg_degree = sum(dict(G.degree()).values())/nx.number_of_nodes(G)
        exp_clustering = avg_degree/nx.number_of_nodes(G)
        print(f"    expected clustering of random network: {exp_clustering:,}")

        # Compute centralities
        new = compute_centralities(H, G)
        for col in ["eigenvector", "betweenness"]:
            new[col + "_rank"] = new[col].rank(method="min", ascending=False)

        # Global measures
        s = global_analysis(H, G)
        rho = spearmanr(new["betweenness"], new["eigenvector"], nan_policy='omit')
        s['rho'] = f"{rho[0]:.2f}{p_to_stars(rho[1])}"

        # Add to DataFrame
        new = (new.reset_index()
                  .melt(id_vars=['index'], var_name='centrality',
                        value_name=year))
        if n_id.endswith('auth'):
            auth = auth.merge(new, "outer", on=['index', 'centrality'])
            global_auth[year] = s
        elif n_id.endswith('com'):
            com = com.merge(new, "outer", on=['index', 'centrality'])
            global_com[year] = s

        # Statistics
        ident = "_".join([n_id[5:], year_name(year, -2)])
        stats = {f"N_of_nodes_{ident}": nx.number_of_nodes(H),
                 f"N_of_nodes_{ident}_giant": nx.number_of_nodes(G)}
        write_stats(stats)

    # WRITE OUT
    t = [('Overall', k) for k in ['Size', 'Links', 'Avg. clustering', 'Components']]
    t.extend([('Giant', k) for k in
             ['Size', 'Density', "Avg. path length", "Diameter", "rho"]])
    networks = [('auth', auth, global_auth), ('com', com, global_com)]
    for label, df1, df2 in networks:
        # Centralities
        df1 = df1.sort_values(['index', 'centrality']).set_index('index')
        fname = f"{TARGET_FOLDER}yearly_centr_{label}.csv"
        df1.to_csv(fname, index_label="node", encoding="utf8")
        # Global statistics
        df2 = df2.T
        df2['Avg. path length'] = df2['Avg. path length'].astype(float).round(2)
        df2.columns = pd.MultiIndex.from_tuples(t)
        fname = f"{OUTPUT_FOLDER}Tables/network_{label}.tex"
        df2.to_latex(fname, multicolumn_format='c', column_format='lrrrr|rrrrr')
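

# `giant` and `p_to_stars` are referenced above but defined elsewhere.  Two
# minimal sketches of plausible implementations (assumptions, not the
# original helpers):
def giant(G):
    """Return the largest connected component of G as a copied subgraph."""
    nodes = max(nx.connected_components(G.to_undirected()), key=len)
    return G.subgraph(nodes).copy()


def p_to_stars(p, thresholds=(0.1, 0.05, 0.01)):
    """Map a p-value to significance stars, e.g. 0.003 -> '***'."""
    return "*" * sum(p < t for t in thresholds)

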
def main():
    # Read acknowledgements
    acks = read_ack_file()
    drops = ['prev', 'misc', 'order', 'ra', 'ind', 'data', 'editor', 'ref']
    acks = acks.drop(drops, axis=1)

    # Count JEL codes
    for col in ['jel', 'jel3']:
        acks[col] = acks[col].apply(clean_jel_codes)
    acks['num_jel'] = (acks['jel'].str.len().fillna(0) +
                       acks['jel3'].str.len().fillna(0))
    acks['num_jel'] = acks['num_jel'].astype("int8")

    # Merge with Scopus
    acks.index = acks['title'].apply(standardize)
    scopus_df = pd.read_csv(SCOPUS_FILE, index_col=0, encoding="utf8")
    df = (scopus_df.drop(["title", "journal", "year"],
                         axis=1).join(acks, how="inner"))
    df.index.name = "simple_title"

    # Count informal collaboration
    ack_cols = ['coms', 'con', 'sem']
    for col in ack_cols:
        df['num_' + col] = df[col].apply(count)
    df["num_coms"] = df["num_coms"].replace(0, np.nan)

    # Mean values of informal collaboration
    count_cols = ["num_" + c for c in ack_cols]
    temp = df[count_cols + ["top"]]
    grouped = temp.groupby("top")[count_cols].agg(["sum", "count"])
    print("Average intensive margin of inf. collab. by journal class:")
    for c in count_cols:
        print(c)
        print(grouped[c]["sum"] / grouped[c]["count"])

    # Set to 0 for papers with acknowledgement
    papers_with = df[count_cols].fillna(0).sum(axis=1) > 0
    df['with'] = papers_with.astype(int)
    df.loc[papers_with, count_cols] = df.loc[papers_with, count_cols].fillna(0)

    # Add metrics for authors and commenters
    df = df.reset_index().set_index(["simple_title", "year"])
    dtypes = {'scopus_id': 'str', 'year': 'uint16'}
    metrics = pd.read_csv(METRICS_FILE, encoding="utf8", dtype=dtypes)
    metrics = metrics.drop(columns=['yearly_pubs', 'yearly_wpubs'])
    metrics["cumcites"] = metrics.groupby("scopus_id")["yearly_cites"].cumsum()
    metrics['year'] = metrics['year'] + 1  # Use previous year's values
    auth_metrics = aggregate(df, metrics, "auth")
    coms_metrics = aggregate(df, metrics, "coms")
    del metrics

    # Add centralities for authors and commenters
    files = sorted(glob(CENTR_FOLDER + "*.csv"))
    centr = pd.concat([read_centrality_file(f) for f in files],
                      axis=0,
                      sort=True)
    centr = custom_pivot(centr,
                         id_var='node',
                         var_name='year',
                         unstack_by='centrality')
    centr = centr.rename(columns={"node": "scopus_id"})
    # Use previous year's values
    centr['year'] = centr['year'].astype('uint16') + 1
    for netw in ("com", "auth"):
        centr[netw + "_giant"] = (~centr[netw + '_eigenvector'].isnull()) * 1
    fill_cols = [c for c in centr if "rank" not in c]
    centr[fill_cols] = centr[fill_cols].fillna(0)
    auth_centr = aggregate(df, centr, "auth")
    coms_centr = aggregate(df, centr, "coms")

    # Combine and fill missings
    df = df.reset_index(level=1)
    df = pd.concat([df, auth_metrics, coms_metrics, auth_centr, coms_centr],
                   axis=1,
                   sort=True)
    for c in df.columns:
        if "giant" in c:
            df[c] = df[c].clip(upper=1)
    fill_cols = list(coms_metrics.columns) + list(coms_centr.columns)
    df.loc[papers_with, fill_cols] = df.loc[papers_with, fill_cols].fillna(0)

    # Write out
    drops = ['title', 'auth', 'coms', 'sem', 'con', 'jel', 'jel3']
    df = df.drop(drops, axis=1)
    df.to_csv(TARGET_FILE, index_label="title")

    # Analyze JEL codes
    acks['jel_cat'] = acks.apply(get_jel_categories, axis=1)
    temp = explode(acks, col="jel_cat")
    dummies = pd.get_dummies(temp.set_index("title"))
    dummies = dummies.groupby(dummies.index).sum()
    print(">>> Distribution of papers to JEL categories:\n",
          dummies.sum(axis=0))
    print(">>> Shares of papers with either G or E:")
    print(
        pd.crosstab(dummies["jel_cat_G"],
                    dummies["jel_cat_E"],
                    margins=True,
                    normalize=True))

    # Statistics
    jel_counter = {v: acks[v].notnull().sum() for v in ('jel', 'jel3')}
    s = {
        'N_of_JEL_all': jel_counter['jel'] + jel_counter['jel3'],
        'N_of_JEL_added': jel_counter["jel3"]
    }
    write_stats(s)
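

# `explode` turns a column of lists (here the JEL categories of each paper)
# into one row per element.  A minimal sketch, assuming the column holds
# list-likes; recent pandas versions ship DataFrame.explode with the same
# behaviour:
def explode(df, col):
    """Return a copy of df with one row per element of the lists in `col`."""
    return df.explode(col).reset_index(drop=True)

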
def main():
    # Read acknowledgements and networks
    acks = read_ack_file()
    G = defaultdict(nx.Graph)
    for fname in glob(NETWORK_FOLDER + '*.gexf'):
        net_type = splitext(basename(fname))[0][5:]
        H = nx.read_gexf(fname).to_undirected()
        G[net_type] = nx.compose(G[net_type], H)

    # Dictionary mapping authors and their commenters
    auth_com_map = {n: [] for n in G['auth'].nodes()}
    for row in acks.itertuples(index=False):
        for author in row.auth:
            auth_com_map[author].extend(row.coms)

    # Reciprocity among coauthors
    acks['r_auth'] = acks[['auth', 'coms']].apply(
        lambda s: realized_reciprocity(s, ('auth', 'coms'), G=G['auth']), axis=1)
    acks['r_auth_p'] = acks['coms'].apply(
        lambda s: potential_reciprocity(s, G['auth']))

    # Reciprocity among commenters
    acks['r_com'] = acks[['auth', 'coms']].apply(
        lambda s: realized_reciprocity(s, mapping=auth_com_map), axis=1)
    acks['r_com_p'] = acks[['auth', 'coms']].apply(
        lambda s: potential_reciprocity(s, G['auth']), axis=1)

    # Read affiliation information
    affs = read_affiliations()

    # Reciprocity among colleagues
    acks = acks.set_index("title")
    auth_aff = group_affiliations("auth", acks, affs, year_correction=1)
    com_aff = group_affiliations("coms", acks, affs, year_correction=1)
    both_aff = pd.concat([auth_aff, com_aff], axis=1).reset_index()
    both_aff = both_aff.dropna(subset=["coms"])
    both_aff["auth"] = both_aff["auth"].str.replace("-", "|")
    for c in ("auth", "coms"):
        both_aff[c] = both_aff[c].str.strip("|").str.strip("-").str.split("|")
    both_aff["com_coll"] = both_aff.apply(count_coll_com, axis=1)
    both_aff = both_aff.dropna(subset=["com_coll"])
    print(">>> Distribution of the number of commenters that are colleagues")
    print(both_aff["com_coll"].value_counts()/both_aff.shape[0])
    acks = acks.merge(both_aff.drop(["year", "auth", "coms"], axis=1),
                      "left", on="title")
    acks["r_coll"] = acks["com_coll"].fillna(0) > 0
    acks["r_coll_p"] = acks["com_coll"].notnull()

    # Statistics
    stats = {'reci_auth_real': acks['r_auth'].sum(),
             'reci_auth_pot': acks['r_auth_p'].sum(),
             'reci_com_real': acks['r_com'].sum(),
             'reci_com_pot': acks['r_com_p'].sum(),
             'reci_coll_real': acks['r_coll'].sum(),
             'reci_coll_pot': acks['r_coll_p'].sum(),
             'reci_any_real': sum(acks[['r_auth', 'r_com', 'r_coll']].any(axis=1)),
             'reci_any_pot': sum(acks[['r_auth_p', 'r_com_p', 'r_coll_p']].any(axis=1))}
    write_stats(stats)

    print(">>> Papers with commenting co-authors: "
          f"{stats['reci_auth_real']:,} (of {stats['reci_auth_pot']:,})")
    print(">>> Papers with authors commenting on their commenters' "
          f"work: {stats['reci_com_real']:,} (of {stats['reci_com_pot']:,})")
    print(">>> Papers with commenters that are colleagues: "
          f"{stats['reci_coll_real']:,} (of {stats['reci_coll_pot']:,})")
    print(">>> Papers with either form of reciprocity: "
          f"{stats['reci_any_real']:,} (of {stats['reci_any_pot']:,})")
def make_barchart(df):
    """Plot a horizontal stacked bar showing the number of pure authors,
    commenting authors and pure commenters.
    """
    # Count
    authors = set()
    for f in glob(NETWORKS_FOLDER + "*auth.gexf"):
        authors.update(nx.read_gexf(f).nodes())
    commenters = set()
    for f in glob(NETWORKS_FOLDER + "*com.gexf"):
        commenters.update(nx.read_gexf(f).nodes())
    # Prepare
    df['scopus_id'] = df['scopus_id'].astype(str)
    pure_com = (commenters - authors)
    pure_auth = set(df[df['com_out_degree'].fillna(0) == 0]['scopus_id'].unique())
    com_auth = (commenters - pure_auth - pure_com)
    print(f">>> {len(pure_auth):,} pure authors "
          f"({sum(x.isdigit() for x in pure_auth):,} w/ Scopus ID); "
          f"{len(pure_com):,} pure commenters "
          f"({sum(x.isdigit() for x in pure_com):,} w/ Scopus ID); "
          f"{len(com_auth):,} mixed types "
          f"({sum(x.isdigit() for x in com_auth):,} w/ Scopus ID)")
    out = pd.DataFrame(data=[len(pure_auth), len(com_auth), len(pure_com)],
                       index=['pure_auth', 'com_auth', 'pure_com'],
                       columns=['persons'])
    # Plot
    fig, ax = plt.subplots(figsize=(25, 4))
    out.T.plot(kind='barh', stacked=True, legend=False, ax=ax, colormap='PiYG',
               alpha=0.7)
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)
    # Hatched area for commenting authors
    ax.patches[1].set(hatch="O", facecolor=ax.patches[0].get_facecolor(),
                      edgecolor=ax.patches[2].get_facecolor(), lw=0)
    # Add labels inside
    for p in ax.patches:
        ax.annotate(f"{int(p.get_width()):,}", fontsize=40,
                    xy=(p.get_x() + p.get_width()/3.1, -0.05))
    # Add bracket outside (set width manually)
    auth_cent = len(authors) / out['persons'].sum() / 2 - 0.01
    bbox = {"boxstyle": 'square', "fc": 'white'}
    arrowprops = {"arrowstyle": '-[, widthB=8.35, lengthB=1',
                  "lw": 2.0, "color": "black"}
    ax.annotate('Authors', xy=(auth_cent, 0.90), xytext=(auth_cent, 0.9),
                xycoords='axes fraction', ha='center', va='bottom',
                bbox=bbox, fontsize=35, arrowprops=arrowprops)
    com_cent = len(commenters) / out['persons'].sum() / 2 + auth_cent - 0.054
    arrowprops.update({"arrowstyle": '-[, widthB=12.73, lengthB=1'})
    ax.annotate('Commenters', xy=(com_cent, 0.10), xytext=(com_cent, 0),
                xycoords='axes fraction', ha='center', va='bottom',
                bbox=bbox, fontsize=35, arrowprops=arrowprops)
    # Save
    sns.despine(ax=None, top=True, right=True, left=True, bottom=True)
    fname = OUTPUT_FOLDER + "Figures/barh_persons.pdf"
    fig.savefig(fname, bbox_inches="tight")
    fname = OUTPUT_FOLDER + "Figures/barh_persons.png"
    fig.savefig(fname, bbox_inches="tight")
    plt.clf()
    # Write stats
    stats = {'N_of_Authors_pure': len(pure_auth),
             'N_of_Commenters_pure': len(pure_com),
             'N_of_Authors_commenting': len(com_auth)}
    write_stats(stats)
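

# `write_stats` is used throughout this listing but not defined here.  A
# minimal sketch of one possible implementation, assuming each statistic is
# appended as a "key,value" row to a csv file (the file name below is
# arbitrary; the real helper might emit LaTeX macros instead):
import csv


def write_stats(stats, fname="stats.csv"):
    """Append the given {name: value} pairs to a simple csv file."""
    with open(fname, "a", newline="") as ouf:
        writer = csv.writer(ouf)
        for key, value in stats.items():
            writer.writerow([key, value])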