예제 #1
0
def get_s2_conferences():
  """Map each S2 paper id to the conference its title matches.

  Returns:
    dict: paper_id => conf_name, for every S2 paper whose (case-insensitive)
    title matches a paper in some conference's paper list.
  """
  # all_conf_papers : [(conf_name, conf_papers)]
  all_conf_papers = list(get_each_conference_papers())
  papers = s2data.get_dict_gA()

  # FIX: the original rescanned every conference's papers for each S2 paper
  # (quadratic). Build one lowercase-title => conf_name lookup instead;
  # setdefault keeps first-wins semantics, matching the original scan order.
  title_to_conf = {}
  for conf_name, conf_papers in all_conf_papers:
    for paper in conf_papers:
      title_to_conf.setdefault(paper["title"].lower(), conf_name)

  s2_paper_confs = {} # paper_id => conf_name
  # iterate through all s2 papers
  for paper_id, paper in papers.items():
    conf_name = title_to_conf.get(paper["title"].lower())
    if conf_name:
      s2_paper_confs[paper_id] = conf_name

  return s2_paper_confs
예제 #2
0
def get_s2id_to_key():
  """Map S2 paper ids to conference paper keys.

  HAVE
    raw_papers  :: [ raw_paper = (title, key) ]
    s2id_to_paper :: s2id => paper
    bad_to_good :: bad_title => good_title

  Returns:
    dict: s2id => paper_key. Titles with no known s2id are counted and
    reported on stdout as "missing".
  """

  conf_papers = list(get_each_conference_papers())
  raw_papers  = sum([ps for (conf_name, ps) in conf_papers], [])

  s2id_to_paper = s2data.get_dict_gA()
  bad_to_good = missing_dicts.get_bad_to_good()

  # good_to_s2id :: good_title => s2id
  good_to_s2id = { p["title"] : s2id for s2id, p in s2id_to_paper.items() }

  # title_to_id :: lowercased title (good or bad variant) => s2id
  title_to_id = {}
  # fill good titles
  for good, s2id in good_to_s2id.items():
    title_to_id[good.lower()] = s2id
  # fill bad titles
  for bad, good in bad_to_good.items():
    good = good.lower()
    # FIX: the original indexed title_to_id[good] unconditionally, raising
    # KeyError for a good title that has no S2 record; skip such entries.
    if good in title_to_id:
      title_to_id[bad.lower()] = title_to_id[good]

  # title_to_key :: title => key
  title_to_key = { rp["title"].lower() : rp["key"] for rp in raw_papers }

  # s2id_to_key :: s2id => paper_key
  s2id_to_key = {}
  missing = 0  # titles that could not be resolved to an s2id
  for title, key in tqdm(title_to_key.items()):
    if title not in title_to_id:  # keys are already lowercased above
      missing += 1
      continue
    s2id_to_key[title_to_id[title]] = key
  print("missing:", missing)

  return s2id_to_key
예제 #3
0
def generate():
    """Build a GEXF citation graph (one node per S2 paper) and write it."""

    ################################################################
    print("[#] Initializing GEXF")

    # graph init
    graph = GEXF("citations_papers")
    # parameters
    graph.setParameter("graph", "defaultedgetype", "directed")
    # attributes
    graph.addAttribute( "node", "conference" , "string", "" )
    graph.addAttribute( "node", "title"      , "string", "" )
    graph.addAttribute( "node", "year"       , "string", "" )

    # TODO: color attribute for each paper
    # TODO: print statistics

    ################################################################
    print("[#] Loading Data:")

    gA = s2data.get_dict_gA()

    ################################################################
    print("[#] Analyzing Data:")

    def safeindex(d, k):
        # dict.get replaces the original's manual membership check.
        return d.get(k, "MISSING")

    # renamed: the original loop variable `id` shadowed the builtin
    for paper_id, paper in gA.items():
        graph.addNode(
            paper_id, {
            "title"      : safeindex(paper, "title"),
            "conference" : safeindex(paper, "venue"),
            "year"       : str(safeindex(paper, "year"))
        })
        for out_id in paper["outCitations"]:
            # NOTE(review): the edge id here is the paper *title*, so every
            # out-edge of one paper shares the same id — confirm GEXF.addEdge
            # tolerates duplicate ids, or switch to a running counter.
            graph.addEdge(safeindex(paper, "title"), paper_id, out_id, 1)

    ################################################################
    print("[#] Writing file:")

    graph.write("/home/blancheh/SystemsAnalysis/systems-papers/gexf/")
예제 #4
0
import utils.combinatorics as u_combos
import utils.debug as debug
from tqdm import tqdm
import networkx as nx

# modules
import utils.data as u_data
import authors.author_features as a_features
import semantic_scholar.s2data as s2data
from papers_network.papers_network import PapersNetwork

################################################################################
# Load Data
debug.message("Loading Data")

# papers :: s2id => paper record (full S2 corpus dict)
papers = s2data.get_dict_gA()
print()

################################################################
# Initialization
debug.message("Creating Papers Network")

G = PapersNetwork()
G.add_papers(papers)
G.fill_graph()

# NOTE(review): `if True:` reads like a hand-edited toggle; when enabled the
# script writes the adjacency matrix and then exits immediately via quit().
if True:
    debug.message("Analyzing Network Statistics")
    G.save_adjacency_matrix_csv()
    quit()
예제 #5
0
def generate():
    """Build a GEXF graph of conference-to-conference citations and write it."""

    ################################################################
    print("[#] Initializing GEXF")

    # graph init
    graph = GEXF("citations_conferences")
    # parameters
    graph.setParameter("graph", "defaultedgetype", "directed")

    # TODO: color attribute for each paper
    # TODO: print statistics

    ################################################################
    print("[#] Loading Data:")

    gA = s2data.get_dict_gA()
    gB = s2data.get_dict_gB()

    ################################################################
    print("[#] Analyzing Data:")

    # FIX: was a list with O(n) membership tests per citation; a set gives
    # O(1) lookups without changing which nodes get added.
    conferences = set()
    edge_id = 0
    missing_count = 0  # outCitations whose target id is absent from gB

    def addNode_safe(conf):
        # Add each conference node exactly once.
        if conf not in conferences:
            graph.addNode(conf, {})
            conferences.add(conf)

    for source_id, source_paper in gA.items():

        # source node; skip papers whose venue normalizes to empty
        source_conf = conf_utils.normalize_conference(source_paper["venue"])
        if len(source_conf) == 0: continue
        addNode_safe(source_conf)

        # for each outcite
        for target_id in source_paper["outCitations"]:

            if target_id not in gB:
                missing_count += 1
                continue

            # target node
            target_conf = conf_utils.normalize_conference(
                gB[target_id]["venue"])
            if len(target_conf) == 0: continue
            addNode_safe(target_conf)

            # edge: ids are a simple running counter
            graph.addEdge(str(edge_id), source_conf, target_conf)
            edge_id += 1

    print("[>] missing count:", missing_count)

    ################################################################
    print("[#] Writing file:")

    graph.write("/home/blancheh/SystemsAnalysis/systems-papers/gexf/")
예제 #6
0
import re
import math
import json

#
# Goal:
#   find the citers that weren't grepped into `citersdict`
#

# DEPRECATED
# compile all of the s2 corpus files
# together into one big megafile, so its
# faster to grep it for titles
#

# gA_dict :: s2id => paper record
gA_dict = s2data.get_dict_gA()
# NOTE(review): str() of a dict yields Python-repr text, presumably used as a
# single haystack for fast substring "grep" over all paper data — confirm.
gA_string = str(gA_dict)

#
# search for all the papers
#

# titles that could not be located in the S2 corpus
missing = {}

# loop through titles of papers
cfns = [ fn for fn in u_data.getConferenceFilenames() ]
last_i = len(cfns) - 1

# start index
i = 0
import utils.strings as util_str
import conferences.conferences as conf_data
import semantic_scholar.s2data as s2_data
import utils.json as u_json
from tqdm import tqdm

# File locations for the missing-title resolution pipeline.
DATA_DIR = "find_missing/data/"
MISSING_RAWTITLES_FN = DATA_DIR + "missing_rawtitles.txt"
RAWTITLE_TO_S2TITLE_FN = DATA_DIR + "rawtitle_to_s2title.json"
RAWTITLE_TO_S2ID_FN = DATA_DIR + "rawtitle_to_s2id.json"

# conferences :: [(conf_name, conf_papers)]; rawpapers flattens all of them.
conferences = list(conf_data.get_each_conference_papers())
rawpapers = sum([conf_papers for conf_name, conf_papers in conferences], [])

# Paper records already present in the S2 corpus.
known_papers = s2_data.get_dict_gA().values()

# Edit-distance bound for fuzzy matching (not used by the exact is_match).
match_threshold = 5


def is_match(s1, s2):
    """Return True when the two title strings are equal, ignoring case."""
    left = s1.lower()
    right = s2.lower()
    return left == right


# Accumulators filled while matching raw titles against S2 records.
rawtitle_to_s2title = {}
rawtitle_to_s2id = {}


def is_known(rawpaper):
    rawtitle = rawpaper["title"]
    for paper in known_papers:
        known_title = paper["title"]
# modules
from gexf.gexf import GEXF
import utils.data as u_data
import semantic_scholar.s2data as s2data
import chord.chord as chord
import chord.chord_colors as chord_colors

################################################################
# parameters

# NOTE(review): not referenced in the visible span — presumably a cutoff
# applied later (e.g. minimum citation count); confirm downstream usage.
threshold = 150

################################################################
print("[#] Loading Data:")

# gA/gB :: s2id => paper record dictionaries from the S2 corpus
gA = s2data.get_dict_gA()
gB = s2data.get_dict_gB()

################################################################
print("[#] Analyzing Data:")

# { conference : { conference: #citations } }
conferences = {}

def inc_conf(source, target):
    """Count one citation from conference *source* to conference *target*."""
    # setdefault/get replace the original's two explicit membership checks.
    targets = conferences.setdefault(source, {})
    targets[target] = targets.get(target, 0) + 1
예제 #9
0
import utils.strings as util_str
import conferences.conferences as conf_data
import semantic_scholar.s2data as s2_data
import json
from tqdm import tqdm

# id_to_paper :: s2id => full S2 paper record
id_to_paper = s2_data.get_dict_gA()
# id_to_title :: s2id => title; a dict comprehension replaces the manual
# append loop of the original.
id_to_title = {p_id: p["title"] for p_id, p in id_to_paper.items()}

# JSON files read/written by the find_missing pipeline.
PARENT_DIR = "find_missing/"
BAD_TITLE_TO_ID_FN = PARENT_DIR + "bad_title_to_id.json"
PARTIAL_TO_GOOD_FN = PARENT_DIR + "partial_to_good.json"
BAD_TO_GOOD_FN = PARENT_DIR + "bad_to_good.json"


def load_json(fn):
    """Read and parse the JSON file at path *fn*.

    FIX: opens read-only with an explicit encoding; the original mode
    "r+" needlessly required write permission on the data file.
    """
    with open(fn, "r", encoding="utf-8") as f:
        return json.load(f)


def save_json(fn, obj):
    """Serialize *obj* as indented JSON to path *fn*, overwriting it.

    FIX: mode "w" with an explicit encoding; the original "w+" opened
    the file read-write, but the read capability was never used.
    """
    with open(fn, "w", encoding="utf-8") as f:
        json.dump(obj, f, indent=4)


# Mappings produced by earlier stages of the pipeline.
bad_title_to_id = load_json(BAD_TITLE_TO_ID_FN)
partial_to_good = load_json(PARTIAL_TO_GOOD_FN)

# get good titles for bad titles
import utils.strings as util_str
import conferences.conferences as conf_data
import semantic_scholar.s2data as s2_data
import utils.json as u_json
from tqdm import tqdm

# File locations for the edit-distance pass of the missing-title pipeline.
DATA_DIR = "find_missing/data/"
MISSING_RAWTITLES_FN = DATA_DIR + "missing_rawtitles.txt"
MISSING_RAWTITLES_ED_FN = DATA_DIR + "missing_rawtitles_editdistance.txt"
RAWTITLE_TO_S2TITLE_FN = DATA_DIR + "rawtitle_to_s2title.json"
RAWTITLE_TO_S2ID_FN = DATA_DIR + "rawtitle_to_s2id.json"

MISSING_RAWTITLES_ED_IDS_FN = DATA_DIR + "missing_rawtitles_editdistance_ids.txt"

# s2id_to_s2paper :: s2id => paper record
s2id_to_s2paper = s2_data.get_dict_gA()

# FIX: mode "r" suffices for read-only access; the original "r+" needlessly
# required write permission on the data files.
with open(MISSING_RAWTITLES_ED_FN, "r") as file:
    found_rawtitles = [line.strip() for line in file]
with open(MISSING_RAWTITLES_ED_IDS_FN, "r") as file:
    found_s2ids = [line.strip() for line in file]

rawtitle_to_s2id = u_json.load_json(RAWTITLE_TO_S2ID_FN)
rawtitle_to_s2title = u_json.load_json(RAWTITLE_TO_S2TITLE_FN)

for i in range(len(found_rawtitles)):
    found_rawtitle = found_rawtitles[i]
    found_s2id = found_s2ids[i]
    if found_s2id == "!": continue
    if not found_s2id in s2id_to_s2paper:
        print("unfound:", found_s2id)
        continue
예제 #11
0
import utils.strings as util_str
import conferences.conferences as conf_data
import semantic_scholar.s2data as s2_data
from tqdm import tqdm

# Version suffix appended to the result filename below.
VERSION = 2

source_fn = "finding_missing/find_missing_papers_result.txt"
target_fn = "finding_missing/find_missing_papers_result_{0}.txt".format(VERSION)

# known :: s2id => paper record; derived title list and reverse index.
known        = s2_data.get_dict_gA()
known_titles = [ p["title"] for p in known.values() ]
title_to_id  = { p["title"] : p_id for p_id, p in known.items() }

# DONE
# conferences  = list(conf_data.get_each_conference_papers())
# papers       = sum([ conf_papers for conf_name, conf_papers in conferences ], [])
# paper_titles = [ p["title"] for p in papers ]

# One candidate title per line of the previous pass's result file.
with open(source_fn) as file:
  paper_titles = [ line.strip() for line in file ]

match_threshold = 5  # edit-distance bound for the (disabled) fuzzy variant

def is_match(s1, s2):
  """Return True when the two titles compare equal after lowercasing."""
  # fuzzy alternative (disabled): util_str.editDistance(s1, s2) <= match_threshold
  left = s1.lower()
  right = s2.lower()
  return left == right

def is_known(paper_title):
  if paper_title in known_titles: return True
  return any([
      is_match(paper_title, known_title)