def generate():
    """Build a citation network from Semantic Scholar data and write it out.

    For each paper in the first 10 conference files, adds a node for the
    paper itself and one for every paper it cites, with an edge between
    them, then writes the graph under the "net/" output directory.
    """
    # Semantic Scholar search helper with fuzzy-match thresholds.
    s3 = sssearch.SSSearch(
        # data dir path
        u_data.semantic_scholar_dir,
        {  # thresholds
            "author": 50,
            "title": 50,
            "year": 2
        })

    # graph init
    graph = NET("network_citation_net")

    # TODO: color attribute for each paper
    # TODO: print statistics

    # iterate through papers (limited to the first 10 conference files)
    conf_filenames = list(u_data.getConferenceFilenames())[:10]

    print("------------------------------------------------")
    print("[#] Analyzing Data:")
    for conf_filename in tqdm(conf_filenames):
        data = u_data.getPapers(conf_filename)
        papers = data['papers']
        for paper in papers:
            # BUG FIX: `key` was an undefined name here; the paper dict
            # carries it (see the organization note below).
            paper_key = paper["key"]
            bibtex = p_features.getCitationsData(paper_key)
            s3entry, _ = s3.query("2017", None, paper["title"])
            # Dict organization
            # paper   : {key, title, authors}
            # bibtex  : {author, journal, title, year}
            # s3entry : {id, authors, title, abstract, journalName}

            # make node for this paper
            graph.addNode(s3entry["id"], {})  # TODO: node attributes

            # make edges to papers that this paper cites
            uid_suffix = 0
            for _ in range(bibtex["size"]):
                # NOTE(review): this queries the same bibtex fields on every
                # iteration — presumably they should be indexed per citation;
                # confirm against getCitationsData's return schema.
                s3cit, _ = s3.query(bibtex["year"], bibtex["author"],
                                    bibtex["title"])
                graph.addNode(s3cit["id"], {})  # TODO: node attributes
                # Numeric suffix keeps edge ids unique within one paper.
                graph.addEdge(paper_key + str(uid_suffix), s3entry["id"],
                              s3cit["id"])
                uid_suffix += 1

    print("------------------------------------------------")
    print("[%] Writing file:")
    graph.write(u_data.systems_papers_directory + "net/")
Пример #2
0
def getAllAuthorsAttribute(attr):
    """Collect attribute ``attr`` for every author across all conferences.

    Returns a dict mapping author name to the maximum value of ``attr``
    observed for that author in any conference file. Entries that are
    plain strings or that lack the attribute are skipped.
    """
    values = {}
    for conf_filename in u_data.getConferenceFilenames():
        for author_name, author_data in u_data.getAuthors(conf_filename).items():
            # Skip malformed entries and authors missing the attribute.
            if isinstance(author_data, str) or attr not in author_data:
                continue
            value = author_data[attr]
            if author_name in values:
                # Seen before: keep the largest value observed so far.
                values[author_name] = max(values[author_name], value)
            else:
                # First occurrence of this author.
                values[author_name] = value
    return values
import semantic_scholar.s2data as s2data
import utils.data as u_data
from tqdm import tqdm

fn_grepciters = u_data.systems_papers_directory + "script/grepciters.sh"

# Generate a shell script that greps the raw Semantic Scholar dumps for
# each paper title and appends the matching lines into citers.json.
# BUG FIX: this region contained unresolved git merge-conflict markers
# (<<<<<<< / ======= / >>>>>>>) and both branches were syntactically
# broken (stray \n outside a string literal; a write() call whose closing
# paren sat inside a comment). Resolved to the branch that collects all
# matches into a single citers.json, with the missing space restored
# between the quoted pattern and the glob.
with open(fn_grepciters, "w+") as grepciters:
    # remove old data file
    grepciters.write("rm " + s2data.citers_fn + "\n")

    # loop through titles of papers
    cfns = list(u_data.getConferenceFilenames())
    i = 0
    l = len(cfns) - 1
    for cfn in tqdm(cfns):
        data = u_data.getPapers(cfn)
        papers = data["papers"]
        for p in papers:
            # Escape double quotes so the title survives shell quoting.
            title = p["title"].replace('"', '\\"')
            grepciters.write('grep -h "' + title + '" ' +
                             u_data.semantic_scholar_dir + '*.json >> ' +  # search raw data
                             s2data.citers_fn + '\n')  # put results into citers.json
            grepciters.write('echo "[' + str(i) + '/' + str(l) + '] ' + title + '"\n')
        # BUG FIX: the conference counter was never incremented, so every
        # progress echo printed [0/l].
        i += 1
Пример #4
0
def generate(color_attribute):
    """Build the author-collaboration network, colored by ``color_attribute``.

    Nodes are authors (unique names); one edge is added per co-authorship
    pair per paper. Edge color encodes the larger of the two authors'
    attribute values. Also dumps a per-author attribute file and prints
    summary statistics.
    """
    # graph
    graph = NET("collaboration_net" + "_color=" + color_attribute)

    # color attribute for each author
    color_attribute_dict = a_features.getAllAuthorsAttribute(color_attribute)
    color_attribute_dict_values = list(color_attribute_dict.values())
    color_attribute_min = min(color_attribute_dict_values)
    color_attribute_max = max(color_attribute_dict_values)
    with open(directory + graph.name + "_attributes.txt", "w+") as file:
        # NOTE(review): header always says "hindex" even when another
        # attribute is requested — confirm downstream readers expect this.
        file.write("name hindex\n")
        for a_name, hind in color_attribute_dict.items():
            file.write("\"" + a_name + "\"" + " " + str(hind) + "\n")
    # print statistics:
    print("------------------------------------------------")
    print(color_attribute + " statistics:")
    print(" - min    :", color_attribute_min)
    print(" - max    :", color_attribute_max)
    print(" - mean   :", np.mean(color_attribute_dict_values))
    print(" - median :", np.median(color_attribute_dict_values))
    print(" - std    :", np.std(color_attribute_dict_values))

    def colorAttributeToColor(val):
        """Map an attribute value onto a blue-to-red hex color."""
        # BUG FIX: normalization was (val - min) / max, which does not map
        # [min, max] onto [0, 1] when min != 0 (and divides by zero when
        # max == 0); use min-max scaling with a zero-width-range guard.
        span = color_attribute_max - color_attribute_min
        norm = (val - color_attribute_min) / span if span else 0.0
        return u_colors.RGBToHexColor(norm, 0.0, 1 - norm)

    # iterate through papers
    conf_filenames = list(u_data.getConferenceFilenames())

    print("------------------------------------------------")
    features = a_features.getAllAuthorFeatures()
    print("[#] Analyzing Data:")
    for conf_filename in tqdm(conf_filenames):
        data = u_data.getPapers(conf_filename)
        papers = data['papers']
        for paper in papers:
            # paper data
            uid_suffix = 0  # make same-paper edges unique
            paper_key = paper['key']
            author_names = [utils.author_name(a)[0] for a in paper['authors']]
            author_unames = [
                a_features.getAuthorUName(a, features)
                for a in paper['authors']
            ]

            # iterate through collaborations (all unordered author pairs)
            indices = list(range(len(author_names)))
            for (i1, i2) in u_combos.pairs_unordered(indices):
                vals = [0]  # for calculating max val in pair
                aus = [author_unames[i1], author_unames[i2]]
                ans = [author_names[i1], author_names[i2]]
                for i in range(2):
                    au, an = aus[i], ans[i]
                    # attributes
                    a_attrs = {}
                    # color_attribute
                    if an in color_attribute_dict:
                        val = color_attribute_dict[an]
                        vals.append(val)
                        a_attrs[color_attribute] = val
                        # a_attrs["color"] = colorAttributeToColor(val)
                    # add node to graph
                    graph.addNode(au, a_attrs)
                # add edge to graph, colored by the pair's larger value
                e_attrs = {"color": colorAttributeToColor(max(vals))}
                weight = 1
                graph.addEdge(paper_key + "_" + str(uid_suffix), aus[0],
                              aus[1], weight, e_attrs)
                uid_suffix += 1

    print("------------------------------------------------")
    print("[%] Writing file:")
    graph.write(directory)
import utils.data as data
import json

# Concatenate the paper lists from every conference file into one list
# and dump it as a single JSON file.
papers = []

cfns = data.getConferenceFilenames()
for cfn in cfns:
    papers += data.getPapers(cfn)["papers"]

# BUG FIX: the output file handle was never closed (json.dump on a bare
# open() call); a context manager guarantees the write is flushed.
with open(data.conf_directory + "allConferencePapers.json", "w+") as out:
    json.dump(papers, out)