Example #1
def read_process_data():
    papers = read_papers()
    topic_mix = read_topic_mix()
    topic_category_map = read_topic_category_map()
    arxiv_cat_lookup = read_arxiv_cat_lookup()

    return papers, topic_mix, topic_category_map, arxiv_cat_lookup
Example #2
def read_process_data():
    papers = read_papers()
    paper_orgs = paper_orgs_processing(read_papers_orgs(), papers)
    paper_orgs["year"] = [x.year for x in paper_orgs["date"]]

    topic_mix = read_topic_mix()
    topic_mix.set_index("article_id", inplace=True)

    return papers, paper_orgs, topic_mix
Example #3
def read_process_data():
    papers = read_papers()
    papers_orgs = paper_orgs_processing(read_papers_orgs(), papers)
    topic_mix = read_topic_mix()
    topic_mix.set_index("article_id", inplace=True)
    vectors = read_vectors().pivot_table(index="article_id",
                                         columns="dimension",
                                         values="value")
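    # The pivot above assumes read_vectors() returns a long (article_id, dimension,
    # value) table and reshapes it into one row per paper, one column per dimension.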

    return papers, papers_orgs, topic_mix, vectors
Example #4
def read_process_data():
    papers = read_papers()
    topic_mix = (
        remove_zero_axis(  # We remove a couple of papers with zero in all topics
            read_topic_mix().set_index("article_id")))

    logging.info("Process dfs")
    papers["year"] = [x.year for x in papers["date"]]

    return papers, topic_mix
Example #5
def read_process_data():
    """Reads and processes the data"""
    arxiv_cat_lookup = read_arxiv_cat_lookup()
    papers = read_papers()
    topic_long = read_topic_long()
    topic_mix = read_topic_mix()
    cats = read_arxiv_categories()

    # Article id sets for each arXiv category
    cat_sets = cats.groupby(["category_id"])["article_id"].apply(lambda x: set(x))

    # Count categories per paper to identify papers with exactly one category
    one_cat_ps = cats.groupby("article_id")["category_id"].apply(lambda x: len(x))
    one_cat_ids = set(one_cat_ps.loc[one_cat_ps == 1].index)

    return papers, topic_mix, topic_long, cats, cat_sets, one_cat_ids, arxiv_cat_lookup
Example #6
def read_process_data():
    papers = read_papers()
    paper_orgs = paper_orgs_processing(read_papers_orgs(), papers)
    topic_mix = read_topic_mix()
    topic_mix.set_index("article_id", inplace=True)

    # topic_long = read_topic_long()
    topic_category_map = read_topic_category_map()
    arxiv_cat_lookup = read_arxiv_cat_lookup()
    topic_list = topic_mix.columns

    return (
        papers,
        paper_orgs,
        topic_mix,
        topic_category_map,
        arxiv_cat_lookup,
        topic_list,
    )
Example #7
def read_process_data():
    papers = read_papers()
    papers_orgs = paper_orgs_processing(read_papers_orgs(), papers)
    # Date
    papers_orgs["year"] = [x.year for x in papers_orgs["date"]]

    # Org diversity df: build and cache it if it does not exist yet, then read it
    org_div_path = f"{project_dir}/data/processed/org_diversity.csv"
    if not os.path.exists(org_div_path):
        logging.info("Making organisational diversity")
        make_org_diversity()
    else:
        logging.info("Reading organisational diversity")
    org_diversity = pd.read_csv(org_div_path)

    return papers, papers_orgs, org_diversity
Example #8
def load_process_data():
    """Loads AI paper data for analysis in section 1."""
    logging.info("Reading data")

    arxiv_cat_lookup = read_arxiv_cat_lookup()
    papers = read_papers()
    topic_long = read_topic_long()
    topic_mix = read_topic_mix()
    cats = read_arxiv_categories()

    logging.info("Reading tokenised abstracts")
    with open(f"{project_dir}/data/interim/arxiv_tokenised.json", "r") as infile:
        arxiv_tokenised = json.load(infile)
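    # arxiv_tokenised is assumed to be a dict keyed by article_id holding the
    # tokenised abstracts; it is mapped onto the papers table below.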

    logging.info("Reading AI labelling outputs")
    with open(f"{project_dir}/data/interim/find_ai_outputs.p", "rb") as infile:
        ai_indices, term_counts = pickle.load(infile)
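    # ai_indices and term_counts are the AI-labelling outputs described above;
    # they are passed through unchanged in the return value.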

    logging.info("Processing")
    papers["tokenised"] = papers["article_id"].map(arxiv_tokenised)

    # Create category sets to identify papers in different categories
    ai_cats = ["cs.AI", "cs.NE", "stat.ML", "cs.LG"]
    cat_sets = cats.groupby("category_id")["article_id"].apply(lambda x: set(x))

    # Create one hot encodings for AI categories (a sorted list is used as the
    # index because newer pandas versions reject a raw set)
    ai_binary = pd.DataFrame(index=sorted(set(cats["article_id"])), columns=ai_cats)

    for c in ai_binary.columns:
        ai_binary[c] = [x in cat_sets[c] for x in ai_binary.index]
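    # Note: ai_binary[c] = ai_binary.index.isin(cat_sets[c]) would be an
    # equivalent vectorised alternative to the comprehension above.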

    # Create arxiv dataset
    papers.set_index("article_id", inplace=True)

    # We remove papers without abstracts and arXiv categories
    arx = pd.concat([ai_binary, papers], axis=1, sort=True).dropna(
        axis=0, subset=["abstract", "cs.AI"]
    )

    return arx, ai_indices, term_counts, arxiv_cat_lookup, cat_sets, cats, ai_cats
Example #9
def read_process_data():
    papers = read_papers()
    paper_orgs = read_papers_orgs()

    return papers, paper_orgs
Example #10
from narrowing_ai_research import project_dir
import statsmodels.api as sm
from statsmodels.api import add_constant
from sklearn.decomposition import PCA
import altair as alt

from narrowing_ai_research.utils.altair_utils import altair_visualisation_setup, save_altair
# -

webd = altair_visualisation_setup()

# ### Read data

# +
papers = (read_papers(
    keep_vars=['article_id', 'year', 'date', 'is_ai', 'citation_count']).query(
        "is_ai == True").reset_index(drop=True))

porgs = read_papers_orgs()

orgs = (paper_orgs_processing(
    porgs, papers).query("is_ai==True").reset_index(drop=True))
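# orgs: AI papers with organisation information attached (paper_orgs_processing
# is assumed to merge org metadata onto the papers read above)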

tm = read_topic_mix()
# -

# ### Create analytical table

# +
# AI papers with private orgs