# Benchmark script: load a table from <srcfile>, then run a Select with the
# predicate "Src < 10000" on it, checkpointing the testutils Timer and
# Resource helpers after each step.
import os
import resource
import sys
import time
import pdb

# The helper modules live outside the default import path, so these
# directories must be appended before the imports below.
sys.path.append("../utils")
sys.path.append("../ringo-engine-python")

import ringo
import snap
import testutils

if __name__ == '__main__':
    if len(sys.argv) < 2:
        # A parenthesized single-argument print produces the same output
        # under the Python 2 print statement and the Python 3 print function.
        print("Usage: " + sys.argv[0] + " <srcfile>")
        sys.exit(1)

    srcfile = sys.argv[1]

    # NOTE: this rebinds the name `ringo` from the module to the engine
    # instance; every `ringo.` call below goes to the instance.
    ringo = ringo.Ringo()
    t = testutils.Timer()
    r = testutils.Resource()

    # Step 1: load the input table.
    table = ringo.Load(srcfile)
    t.show("load bin")
    r.show("__loadbin__")

    # Step 2: filter the loaded table; the return value is not used (the
    # checkpoint label describes this as an in-place selection).
    ringo.Select(table, "Src < 10000")
    t.show("selected < 10K in place")
    r.show("__selectedlt10Kinplace__")
# Benchmark script: load a table from <srcfile>, then run a Select with the
# predicate "Src < 10000" whose result is bound to a second table,
# checkpointing the testutils Timer and Resource helpers after each step.
import sys
import time
import pdb

# The helper modules live outside the default import path, so these
# directories must be appended before the imports below.
sys.path.append("../utils")
sys.path.append("../ringo-engine-python")

import ringo
import snap
import testutils

if __name__ == '__main__':
    if len(sys.argv) < 2:
        # A parenthesized single-argument print produces the same output
        # under the Python 2 print statement and the Python 3 print function.
        print("Usage: " + sys.argv[0] + " <srcfile>")
        sys.exit(1)

    srcfile = sys.argv[1]

    # NOTE: this rebinds the name `ringo` from the module to the engine
    # instance; every `ringo.` call below goes to the instance.
    ringo = ringo.Ringo()
    t = testutils.Timer()
    r = testutils.Resource()

    # Step 1: load the input table.
    table = ringo.Load(srcfile)
    t.show("load bin")
    r.show("__loadbin__")

    # Step 2: filter into a separate result table. The third positional
    # argument (False) presumably disables in-place operation, matching the
    # "new table" checkpoint label -- confirm against the Ringo API.
    table2 = ringo.Select(table, "Src < 10000", False)
    # Ringo? table2 = snap.TTable.New(table.GetSchema(), context)
    t.show("selected < 10K new table")
    r.show("__selectedlt10Knewtable__")
dstfile: destination file for saving the pagerank table""" sys.exit(1) srcfile = sys.argv[1] dstfile = sys.argv[2] ringo = ringo.Ringo() t = testutils.Timer() r = testutils.Resource() S = [("Id", "int"), ("OwnerUserId", "int"), ("AcceptedAnswerId", "int"), ("CreationDate", "string"), ("Score", "int"), ("Tag", "string")] table = ringo.LoadTableTSV(S, srcfile) t.show("load posts text") .show("__loadpoststext__") questions = ringo.Select(table, "Tag = 'python'", False) t.show("selected tag = 'python'") r.show("__selectedtagpython__") qa = ringo.Join(questions, table, "AcceptedAnswerId", "Id") graph = ringo.ToGraph(qa, "OwnerUserId-2", "OwnerUserId-1") t.show("join") r.show("__join__") t.show("graph") r.show("__graph__") prtable = ringo.PageRank(graph, 'PageRank', False, 0.85, 1e-4, 100) t.show("pagerank") r.show("__pagerank__")
("pullid", "int"), ("status", "string"), ("created_at", "int")] Tpull = ringo.LoadTableTSV(S3, file_cache[TPULL]) t.show("load pull") Tfork = ringo.LoadTableTSV(S2, file_cache[TFORK]) t.show("load fork") Twatch = ringo.LoadTableTSV(S2, file_cache[TWATCH]) t.show("load watch") # If (u,v) collaborated on the same repository - determined by the owner, name pair, # are added as collaborators. #TODO Better column renaming Tcollab_merge = ringo.SelfJoin(Tcollab, "owner") ringo.Select(Tcollab_merge, "2_1.name = 2_2.name", True) ringo.ColMin(Tcollab_merge, "2_1.created_at", "2_2.created_at", "created_at") ringo.Project(Tcollab_merge, ("2_1.userid", "2_2.userid", "created_at")) ringo.Rename(Tcollab_merge, "2_1.userid", "userid1") ringo.Rename(Tcollab_merge, "2_2.userid", "userid2") t.show("merge collab", Tcollab_merge) # If (u,v) worked on the same pull request on the same repository, they are added # as (soft) collaborators. Tpull_merge = ringo.SelfJoin(Tpull, "owner") ringo.Select(Tpull_merge, "3_1.name = 3_2.name", True) ringo.Select(Tpull_merge, "3_1.pullid = 3_2.pullid", True) ringo.ColMin(Tpull_merge, "3_1.created_at", "3_2.created_at", "created_at") ringo.Project(Tpull_merge, ("3_1.userid", "3_2.userid", "created_at")) ringo.Rename(Tpull_merge, "3_1.userid", "userid1") ringo.Rename(Tpull_merge, "3_2.userid", "userid2")
# Timer used for the per-step checkpoints below; ENABLE_TIMER is defined
# outside this fragment.
t = testutils.Timer(ENABLE_TIMER)

# --- Posts table -----------------------------------------------------------
S = [
    ('PostId', 'int'),
    ('UserId', 'int'),
    ('AnswerId', 'int'),
    ('CreationDate', 'string'),
]
posts_path = os.path.join(srcdir, POSTS_FILE)
t1 = ringo.LoadTableTSV(S, posts_path)
t.show("load posts", t1)

# --- Tags table ------------------------------------------------------------
# S is deliberately reused: from here on it holds the tags schema.
S = [
    ('PostId', 'int'),
    ('Tag', 'string'),
]
tags_path = os.path.join(srcdir, TAGS_FILE)
t2 = ringo.LoadTableTSV(S, tags_path)
t.show("load tags", t2)

# --- Filter tags -----------------------------------------------------------
# The return value is unused, so t2 itself is the filtered table afterwards.
# CompConstant=True presumably makes `python` a constant rather than a
# column name -- confirm against the Ringo API.
ringo.Select(t2, 'Tag = python', CompConstant=True)
t.show("select", t2)

# --- Posts joined with the filtered tags on PostId ---------------------------
t3 = ringo.Join(t1, t2, 'PostId', 'PostId')
t.show("join", t3)

# --- Join again with the posts table: AnswerId of t3 against PostId ---------
t4 = ringo.Join(t3, t1, "1.AnswerId", "PostId")
t.show("join", t4)

# --- Graph built from the two UserId columns of the double join -------------
graph = ringo.ToGraph(t4, "1_2.1.UserId", "1.UserId")
t.show("graph", graph)
ringo.ShowMetadata(graph)
sys.exit(1)

# NOTE(review): the exit call above presumably belongs to a usage guard whose
# header lies before this fragment; its original nesting is not visible here.
srcfile = sys.argv[1]
dstfile = sys.argv[2]

# NOTE: this rebinds the name `ringo` from the module to the engine instance.
ringo = ringo.Ringo()
t = testutils.Timer()
r = testutils.Resource()

# Column schema of the posts TSV file.
S = [
    ("Id", "int"),
    ("OwnerUserId", "int"),
    ("AcceptedAnswerId", "int"),
    ("CreationDate", "string"),
    ("Score", "int"),
    ("Tag", "string"),
]

# Step 1: load the posts table.
table = ringo.LoadTableTSV(S, srcfile)
t.show("load posts text")
r.show("__loadpoststext__")

# Step 2: posts tagged 'python', bound to a new table.
questions = ringo.Select(table, "Tag = 'python'", False)
t.show("selected tag = 'python'")
r.show("__selectedtagpython__")

# Step 3: filter `questions` to rows with a non-zero AcceptedAnswerId
# (return value unused).
ringo.Select(questions, "AcceptedAnswerId != 0")
t.show("select questions")
r.show("__selectquestions__")

# Step 4: filter the full table to rows whose AcceptedAnswerId is zero
# (return value unused).
ringo.Select(table, "AcceptedAnswerId = 0")
t.show("select answers")
r.show("__selectanswers__")

# Step 5: join the two filtered tables; only the checkpoints are of interest
# here, the join result itself is discarded.
ringo.Join(questions, table, "AcceptedAnswerId", "Id")
t.show("join")
r.show("__join__")
('CreationDate', 'string')] POSTS = ringo.LoadTableTSV(S, os.path.join(srcdir, POSTS_FILE)) t.show("load posts", POSTS) # Load tags S = [('PostId', 'int'), ('Tag', 'string')] T = ringo.LoadTableTSV(S, os.path.join(srcdir, TAGS_FILE)) t.show("load tags", T) # Join P = ringo.Join(POSTS, T, "PostId", "PostId") t.show("join", P) # Select Java posts print ringo.GetSchema(P) ringo.Select(P, '2.Tag = java', CompConstant=True) t.show("select", P) # Select Questions Q = ringo.Select(P, '1.AnswerId != 0', InPlace=False, CompConstant=True) t.show("select", Q) # Select Answers A = ringo.Select(P, '1.AnswerId = 0', InPlace=False, CompConstant=True) t.show("select", A) #print ringo.DumpTableContent(Q,5) #print ringo.DumpTableContent(A,5) #ringo.GenerateProvenance(Q, '06-StackOverflow-paper-autogen-Q.py') #ringo.GenerateProvenance(A, '06-StackOverflow-paper-autogen-A.py') # Join
# Build 'so_posts.tsv' from a posts file and a tags file: posts are joined
# with the tags table, a second join attaches tags through AcceptedAnswerId,
# and the two results are unioned and saved.
import sys

import ringo

# Fail with a usage message instead of an IndexError traceback when the two
# required paths are missing.
if len(sys.argv) < 3:
    print("Usage: " + sys.argv[0] + " <posts_file> <tags_file>")
    sys.exit(1)

posts_file = sys.argv[1]
tags_file = sys.argv[2]

# NOTE: this rebinds the name `ringo` from the module to the engine instance.
ringo = ringo.Ringo()

S_posts = [
    ('Id', 'int'),
    ('PostTypeId', 'int'),
    ('AcceptedAnswerId', 'int'),
    ('OwnerUserId', 'int'),
    ('Body', 'string'),
]
S_tags = [('Id', 'int'), ('Tag', 'string')]

# Load the posts and filter them to a positive OwnerUserId (return value
# unused, so `posts` is the filtered table afterwards).
posts = ringo.LoadTableTSV(S_posts, posts_file)
ringo.Select(posts, 'OwnerUserId > 0')
#print ringo.ringo.DumpTableContent(posts) # buggy...

# Posts joined with the tags table on Id, reduced to the post columns + Tag.
q_tags = ringo.LoadTableTSV(S_tags, tags_file)
QT = ringo.Join(posts, q_tags, "Id", "Id")
QT = ringo.Project(QT, ['Id', 'PostTypeId', 'AcceptedAnswerId', 'OwnerUserId', 'Body', 'Tag'])

# (AcceptedAnswerId, Tag) pairs taken from QT. The trailing False presumably
# requests a new table rather than an in-place projection -- confirm against
# the Ringo API.
a_tags = ringo.Project(QT, ['AcceptedAnswerId', 'Tag'], False)

# Posts whose Id equals an AcceptedAnswerId in a_tags, with the same column
# set as QT.
AT = ringo.Join(posts, a_tags, 'Id', 'AcceptedAnswerId')
AT = ringo.Project(AT, ['Id', 'PostTypeId', 'AcceptedAnswerId', 'OwnerUserId', 'Body', 'Tag'])

# Concatenate both tagged sets and write the result.
T = ringo.UnionAll(QT, AT)
ringo.SaveTableTSV(T, 'so_posts.tsv')
# Compute PageRank scores over the question/accepted-answer graph of the
# posts tagged 'java' in <src_file>, and save them to 'scores.tsv'.
import ringo
import sys

# Fail with a usage message instead of an IndexError traceback when the
# required path is missing.
if len(sys.argv) < 2:
    print("Usage: " + sys.argv[0] + " <src_file>")
    sys.exit(1)

src_file = sys.argv[1]

Schema = [
    ('Id', 'int'),
    ('PostTypeId', 'int'),
    ('AcceptedAnswerId', 'int'),
    ('OwnerUserId', 'int'),
    ('Body', 'string'),
    ('Tag', 'string'),
]

# NOTE: this rebinds the name `ringo` from the module to the engine instance.
ringo = ringo.Ringo()

# Load the posts and project to every schema column except Body (return
# value unused, so P itself is the projected table afterwards).
P = ringo.LoadTableTSV(Schema, src_file, '\t', True)
ringo.Project(P, ['Id', 'PostTypeId', 'AcceptedAnswerId', 'OwnerUserId', 'Tag'])

# Posts tagged 'java', split by PostTypeId into Q (type 1) and A (type 2).
JP = ringo.Select(P, "Tag = 'java'", False)
Q = ringo.Select(JP, 'PostTypeId = 1', False)
A = ringo.Select(JP, 'PostTypeId = 2', False)

# Pair each row of Q with the row of A whose Id is its AcceptedAnswerId, then
# build a graph from the two OwnerUserId columns of the joined table.
QA = ringo.Join(Q, A, 'AcceptedAnswerId', 'Id')
G = ringo.ToGraph(QA, 'OwnerUserId-1', 'OwnerUserId-2')

PR_MAP = ringo.PageRank(G)  # a hash map object: node/user id -> PageRank score
PR = ringo.TableFromHashMap(PR_MAP, 'user', 'score')
PR = ringo.Order(PR, ['score'])
ringo.SaveTableTSV(PR, 'scores.tsv')
#ringo.SaveTableBinary(PR, 'scores')

# Write out a provenance script for the graph.
ringo.GenerateProvenance(G, 'G.py')